library(edgeR)
library(DESeq2)
library(ggplot2)
library(gplots)
library(RColorBrewer)
library(stringr)
library(cowplot)
library(scales)
library(pheatmap)
library(org.Mm.eg.db)
library(dplyr)fcounts.ensembl <- DGEList(counts=fcounts.ensembl.raw$counts,
genes = fcounts.ensembl.raw$annotation,
samples = targets$sample, group=targets$genotype)
colnames(fcounts.ensembl) <- str_extract(targets$sample,"JM[0-9]{1,2}")
fcounts.ensembl <- calcNormFactors(fcounts.ensembl)fcounts.noncode.raw <- readRDS("../FEATURECOUNTS/fcounts_ensembl_noncode_gene.stranded2.rds")
fcounts.noncode <- DGEList(counts=fcounts.noncode.raw$counts,
genes = fcounts.noncode.raw$annotation,
samples = targets$sample, group=targets$genotype)
colnames(fcounts.noncode) <- str_extract(targets$sample,"JM[0-9]{1,2}")
fcounts.noncode <- calcNormFactors(fcounts.noncode)For the plot –> https://github.com/tidyverse/ggplot2/wiki/legend-attributes
library(colorspace) ## hsv colorspace manipulations
desat <- function(cols, sat=0.5) {
X <- diag(c(1, sat, 1)) %*% rgb2hsv(col2rgb(cols))
hsv(X[1,], X[2,], X[3,])
}
# change colors to rcolorbrewer palette with more contrast for different groups
mycols <- c(brewer.pal(n = 8, name = "Dark2"),desat(brewer.pal(n = 8, name = "Set1")[1],0.7))
# swap Ntg and swim colors for better differences later on
mycols <- mycols[c(1,2,3,6,5,4,7,8,9)]bplot <- fcounts.ensembl$samples[c(11,22,30:36,1:10,12:21,23:29),]
bplot$ids <- str_extract(targets$sample,"JM[0-9]{1,2}")
bplot$ids <- factor(paste0("JM",c(1:36)),levels=paste0("JM",c(1:36)))
bplot$group <- factor(as.character(bplot$group),levels=unique(bplot$group))
p <- ggplot(bplot,aes(x=ids,y=lib.size*1e-6,fill=group)) +
geom_bar(stat = "identity",position = position_dodge()) +
theme(axis.text.x = element_text(angle = 90, hjust = 1,vjust=0.5)) +
geom_abline(intercept=35, slope=0) +
ggtitle("Library Size per sample (ensembl mm10)") +
xlab ("Sample ID") +
ylab ("Library size (millions)") +
scale_fill_manual(values=mycols) +
theme(legend.text = element_text(size = 10),legend.key.size = unit(0.5, "lines"))bplot.nc <- fcounts.noncode$samples[c(11,22,30:36,1:10,12:21,23:29),]
bplot.nc$ids <- str_extract(targets$sample,"JM[0-9]{1,2}")
bplot.nc$ids <- factor(paste0("JM",c(1:36)),levels=paste0("JM",c(1:36)))
bplot.nc$group <- factor(as.character(bplot.nc$group),levels=unique(bplot.nc$group))
pnoncode <- ggplot(bplot.nc,aes(x=ids,y=lib.size*1e-6,fill=group)) +
geom_bar(stat = "identity",position = position_dodge()) +
theme(axis.text.x = element_text(angle = 90, hjust = 1,vjust=0.5)) +
geom_abline(intercept=25, slope=0) +
ggtitle("Library Size per sample (noncode data)") +
xlab ("Sample ID") +
ylab ("Library size (millions)") +
scale_fill_manual(values=mycols) +
#guides(color = guide_legend(override.aes = list(size = 0.2))) +
theme(legend.text = element_text(size = 10),legend.key.size = unit(0.5, "lines"))
plot_grid(p,pnoncode,nrow=2)# bplot <- fcounts.ensembl$samples[c(11,22,30:36,1:10,12:21,23:29),]
# bplot$ids <- str_extract(targets$sample,"JM[0-9]{1,2}")
# bplot$ids <- factor(paste0("JM",c(1:36)),levels=paste0("JM",c(1:36)))
# bplot$group <- factor(as.character(bplot$group),levels=unique(bplot$group))
p <- ggplot(bplot[c(1:16),],aes(x=ids,y=lib.size*1e-6,fill=group)) +
geom_bar(stat = "identity",position = position_dodge()) +
theme(axis.text.x = element_text(angle = 90, hjust = 1,vjust=0.5)) +
geom_abline(intercept=35, slope=0) +
ggtitle("Library Size per sample (ensembl mm10)") +
xlab ("Sample ID") +
ylab ("Library size (millions)") +
scale_fill_manual(values=mycols[c(1:4)]) +
theme(legend.text = element_text(size = 10),legend.key.size = unit(0.5, "lines"))
# bplot.nc <- fcounts.noncode$samples[c(11,22,30:36,1:10,12:21,23:29),]
# bplot.nc$ids <- str_extract(targets$sample,"JM[0-9]{1,2}")
# bplot.nc$ids <- factor(paste0("JM",c(1:36)),levels=paste0("JM",c(1:36)))
# bplot.nc$group <- factor(as.character(bplot.nc$group),levels=unique(bplot.nc$group))
pnoncode <- ggplot(bplot.nc[c(1:16),],aes(x=ids,y=lib.size*1e-6,fill=group)) +
geom_bar(stat = "identity",position = position_dodge()) +
theme(axis.text.x = element_text(angle = 90, hjust = 1,vjust=0.5)) +
geom_abline(intercept=25, slope=0) +
ggtitle("Library Size per sample (noncode data)") +
xlab ("Sample ID") +
ylab ("Library size (millions)") +
scale_fill_manual(values=mycols[c(1:4)]) +
#guides(color = guide_legend(override.aes = list(size = 0.2))) +
theme(legend.text = element_text(size = 10),legend.key.size = unit(0.5, "lines"))
plot_grid(p,pnoncode,nrow=2)# bplot <- fcounts.ensembl$samples[c(11,22,30:36,1:10,12:21,23:29),]
# bplot$ids <- str_extract(targets$sample,"JM[0-9]{1,2}")
# bplot$ids <- factor(paste0("JM",c(1:36)),levels=paste0("JM",c(1:36)))
# bplot$group <- factor(as.character(bplot$group),levels=unique(bplot$group))
p <- ggplot(bplot[c(17:36),],aes(x=ids,y=lib.size*1e-6,fill=group)) +
geom_bar(stat = "identity",position = position_dodge()) +
theme(axis.text.x = element_text(angle = 90, hjust = 1,vjust=0.5)) +
geom_abline(intercept=35, slope=0) +
ggtitle("Library Size per sample (ensembl mm10)") +
xlab ("Sample ID") +
ylab ("Library size (millions)") +
scale_fill_manual(values=mycols[c(5:9)]) +
theme(legend.text = element_text(size = 10),legend.key.size = unit(0.5, "lines"))
# bplot.nc <- fcounts.noncode$samples[c(11,22,30:36,1:10,12:21,23:29),]
# bplot.nc$ids <- str_extract(targets$sample,"JM[0-9]{1,2}")
# bplot.nc$ids <- factor(paste0("JM",c(1:36)),levels=paste0("JM",c(1:36)))
# bplot.nc$group <- factor(as.character(bplot.nc$group),levels=unique(bplot.nc$group))
pnoncode <- ggplot(bplot.nc[c(17:36),],aes(x=ids,y=lib.size*1e-6,fill=group)) +
geom_bar(stat = "identity",position = position_dodge()) +
theme(axis.text.x = element_text(angle = 90, hjust = 1,vjust=0.5)) +
geom_abline(intercept=25, slope=0) +
ggtitle("Library Size per sample (noncode data)") +
xlab ("Sample ID") +
ylab ("Library size (millions)") +
scale_fill_manual(values=mycols[c(5:9)]) +
#guides(color = guide_legend(override.aes = list(size = 0.2))) +
theme(legend.text = element_text(size = 10),legend.key.size = unit(0.5, "lines"))
plot_grid(p,pnoncode,nrow=2)## [1] 53465 36
## [1] 141239 36
countsPerMillion <- cpm(dgList)
countCheck <- countsPerMillion > 1
keep <- which(rowSums(countCheck) >= 4)
dgList <- dgList[keep,]
dim(dgList)## [1] 14128 36
countsPerMillion.nc <- cpm(dgList.noncode)
countCheck.nc <- countsPerMillion.nc > 1
keep.nc <- which(rowSums(countCheck.nc) >= 4)
dgList.noncode <- dgList.noncode[keep.nc,]
dim(dgList.noncode)## [1] 17400 36
OLD!!
# # bplot$source <- "ensembl"
# # bplot.nc$source <- "noncode"
# reads.summary <- bplot
# rownames(reads.summary) <- NULL
# reads.summary <- cbind(reads.summary,read.table("../STAR_ALIGNMENT/summary_aligned_reads.txt"))
# reads.summary$V3 <- as.numeric(sub("%","",reads.summary$V3))
# # add count info
# ensembl.counts <- colSums(cpm(dgList) > 1)
# noncode.counts <- colSums(cpm(dgList.noncode) > 1)
# reads.summary$ensembl.counts <- ensembl.counts[match(reads.summary$ids,names(ensembl.counts))]
# reads.summary$noncode.counts <- noncode.counts[match(reads.summary$ids,names(noncode.counts))]
# reads.summary <- reads.summary %>%
# select(group,ids, V1,V2,V3,ensembl.counts,noncode.counts) %>%
# rename(sample=group,input.reads=V1,aligned.reads=V2, aligned.reads.perc=V3) %>%
# group_by(sample) %>%
# summarize(input_reads=mean(input.reads),
# unique_aligned_reads=mean(aligned.reads),
# perc_aligned_reads=round(mean(aligned.reads.perc),2),
# avg.ensembl.counts=mean(ensembl.counts),
# avg.noncode.counts=mean(noncode.counts)) %>%
# ungroup()
# write.csv(reads.summary,file="csv/reads_counts_summary.csv")# dgList$samples$lib.size <- colSums(dgList$counts)
# dgList <- calcNormFactors(dgList, method="TMM")
# colnames_OK <- targets$sample
#plotMDS(dgList, method="bcv", col=as.numeric(dgList$samples$group),labels=str_extract(targets$sample,"JM[0-9]{1,2}"),main=paste("MDS all samples - ",dim(dgList)[1]," genes",sep=""))# plot_cpm_boxplot <- function(my_countdata, my_condition, my_fcounts) {
# logcounts <- log2(my_countdata + 1)
# statusCol <- as.numeric(factor(my_condition))
# logcounts <- logcounts[,c(11,22,30:36,1:10,12:21,23:29)]
# statusCol <- statusCol[order(statusCol)]
# col <- hue_pal()(9)[statusCol]
# # Check distributions of samples using boxplots
# par(mfrow=c(2,1),oma=c(1,0,0,0) + 0.1, mar=c(2,5,1,1) + 0.1)
# boxplot(logcounts,
# xlab="",
# cex.axis=0.7,
# ylab="Log2(Counts)",
# las=2,
# col=col)
# # Let's add a blue horizontal line that corresponds to the median logCPM
# abline(h=median(as.matrix(logcounts)), col="blue")
# fcounts.ensembl.norm <- cpm(my_fcounts, normalized.lib.sizes=TRUE, log = TRUE)
# fcounts.ensembl.norm.reorder <- fcounts.ensembl.norm[,c(11,22,30:36,1:10,12:21,23:29)]
# boxplot(fcounts.ensembl.norm.reorder, col=col, las=2, cex.axis=0.7, ylab="log2CPM")
# abline(h=0, col="blue")
# }# countdata <- as.matrix(fcounts.ensembl.raw$counts)
# colnames(countdata) <- str_extract(targets$sample,"JM[0-9]{1,2}")
# condition <- factor(targets$genotype,levels=levels(targets$genotype)[c(1,2,3,5,4,7,6,8,9)])
# plot_cpm_boxplot(countdata,condition,fcounts.ensembl)# countdata.nc <- as.matrix(fcounts.noncode.raw$counts)
# colnames(countdata.nc) <- str_extract(targets$sample,"JM[0-9]{1,2}")
# plot_cpm_boxplot(countdata.nc,condition,fcounts.noncode)https://bioconductor.org/packages/devel/bioc/vignettes/DEFormats/inst/doc/DEFormats.html
countdata <- as.matrix(fcounts.ensembl.raw$counts)
colnames(countdata) <- str_extract(targets$sample,"JM[0-9]{1,2}")
condition <- factor(targets$genotype,levels=levels(targets$genotype)[c(1,2,3,5,4,7,6,8,9)])
coldata <- data.frame(row.names=colnames(countdata), condition)
dds <- DESeqDataSetFromMatrix(countData=countdata, colData=coldata, design=~condition)Reorder dds to have the data correctly sorted
dds <- dds[,c(11,22,30:36,1:10,12:21,23:29)]
condition <- condition[c(11,22,30:36,1:10,12:21,23:29)]
c(11,22,30:36,1:10,12:21,23:29)## [1] 11 22 30 31 32 33 34 35 36 1 2 3 4 5 6 7 8 9 10 12 13 14 15 16 17
## [26] 18 19 20 21 23 24 25 26 27 28 29
Run the DESeq pipeline * Remove first the “uncounted” genes: cpm > 1 at least in 4 samples
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
plot_cpm_boxplot <- function(my_dds, my_condition) {
mycols_4reps <- mycols[as.numeric(factor(condition))]
# logcounts <- log2(assay(my_dds) + 1)
logcounts <- log2(counts(my_dds,normalized=FALSE) + 1)
par(mfrow=c(2,1),oma=c(1,0,0,0) + 0.1, mar=c(2,5,1,1) + 0.1)
boxplot(logcounts,
xlab="Sample ID",
ylab="Log2(Counts)",
las=2, cex.axis=0.7,
col=mycols_4reps)
abline(h=median(as.matrix(logcounts)), col="black")
logcounts <- log2(counts(dds,normalized=TRUE) + 1)
boxplot(logcounts,
xlab="Sample ID",
ylab="Log2(Counts) - Norm",
las=2, cex.axis=0.7,
col=mycols_4reps)
abline(h=median(as.matrix(logcounts)), col="black")
}countdata.nc <- as.matrix(fcounts.noncode.raw$counts)
colnames(countdata.nc) <- str_extract(targets$sample,"JM[0-9]{1,2}")
condition <- factor(targets$genotype,levels=levels(targets$genotype)[c(1,2,3,5,4,7,6,8,9)])
coldata.nc <- data.frame(row.names=colnames(countdata.nc), condition)
dds.nc <- DESeqDataSetFromMatrix(countData=countdata.nc, colData=coldata.nc, design=~condition)Reorder dds to have the data correctly sorted
dds.nc <- dds.nc[,c(11,22,30:36,1:10,12:21,23:29)]
condition <- condition[c(11,22,30:36,1:10,12:21,23:29)]
dds.nc.SCOT <- dds.ncRun the DESeq pipeline * Remove first the “uncounted” genes: cpm > 1 at least in 4 samples
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
Here is where I correct following Scot’s directions: I will only use the NONCODE lncRNA from the second pass and combine them with everything else from the ensembl gtf.
Commented lines are tests, will keep just in case.
# dim(dds)
# dim(dds.nc)
# k <- rownames(dds)
# knc <- rownames(dds.nc)
# length(k)
# length(knc)
# length(knc %in% k)
# sum(knc %in% k)
# nonk <- knc[!knc %in% k]
# length(nonk)
# sum(str_detect(nonk,"NONMMU"))
# dds.nc[str_detect(rownames(dds.nc),"NONMMU")]
ddsNEW <- dds.nc.SCOT[str_detect(rownames(dds.nc.SCOT),"NONMMU")]
ddsOK <- rbind(dds.SCOT,ddsNEW)
ddsOK <- ddsOK[rowSums(fpm(ddsOK)>1)>=4]
ddsOK <- DESeq(ddsOK)## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
Sample distance heatmap
vst <- vst(ddsOK)
vst.nc <- vst
vst.OK <- vst
# mycols <- hue_pal()(9)[1:length(unique(condition))]
sampleDists_vst <- as.matrix(dist(t(assay(vst))))# default
lmat=rbind(4:3,2:1)
lwid = c(1.5,4)
lhei = c(1.5,4)
# key on the bottom
# lmat = rbind(c(0,3),c(2,1),c(0,4))
# lwid = c(1.5,4)
# lhei = c(1.5,4,1)
# when using colside and rowsidecolors, the numbers change
# https://stackoverflow.com/questions/15351575/moving-color-key-in-r-heatmap-2-function-of-gplots-package
## 1. Heatmap,
## 2. Row dendrogram,
## 3. Column dendrogram,
## 4. Key
# to:
## 1. Rowsidecols
## 2. Colsidecols
## 3. heatmap
## 4. row dend
## 5. col dend
## 6. key
######################
lmat=rbind( c(6,0,5),c(0,0,2),c(4,1,3) )
lhei=c(2,0.2,4)
lwid=c(2,0.13,4)
heatmap.2(rescale(as.matrix(sampleDists_vst)), trace="none",
key=T, key.title="Sample-sample distance",
keysize=2, density.info = "none", denscol=F, key.ylab='', key.xlab = "Sample-sample distance",
key.par=list(mgp=c(1.2, 0.5, 0),
mar=c(10, 3, 1.5, 3),
cex=0.6),
lmat=lmat,
lwid = lwid,
lhei = lhei,
col=colorpanel(100, "black", "white"),
ColSideColors=mycols[condition], RowSideColors=mycols[condition],
margin=c(4, 4),
main="Sample Distances Matrix")
legend(x=-0.08,y=0.92,xpd=T, legend=unique(condition), col=mycols,lty=1,lwd=5,cex=0.6)Heatmap of particular groups to compare, as Jenny asked on the begining of May:
groups_to_plot <- c(25:36)
condition_plot <- condition[groups_to_plot]
heatmap.2(rescale(as.matrix(sampleDists_vst[groups_to_plot,groups_to_plot])), trace="none",
symkey=F,
key=T, key.title="Sample-sample distance",
keysize=2, density.info = "none", denscol=F, key.ylab='', key.xlab = "Sample-sample distance",
key.par=list(mgp=c(1.2, 0.5, 0),
mar=c(9, 3, 2.5, 3),
cex=0.6),
lmat=lmat,
lwid = lwid,
lhei = lhei,
col=colorpanel(100, "black", "white"), labRow = F,labCol = F,
ColSideColors=mycols[condition_plot], RowSideColors=mycols[condition_plot],
margin=c(4, 4),
main="Sample Distances Matrix - TAC Group")
legend(x=-0.08,y=0.87,xpd=T, legend=unique(condition_plot), col=unique(mycols[condition_plot]),lty=1,lwd=5,cex=0.8)# heatmap.2(as.matrix(sampleDists_vst[groups_to_plot,groups_to_plot]), trace="none",
# symkey=F,
# key=T, key.title="Sample-sample distance",
# keysize=2, density.info = "none", denscol=F, key.ylab='', key.xlab = "Sample-sample distance",
# key.par=list(mgp=c(1.2, 0.5, 0),
# mar=c(9, 3, 2.5, 3),
# cex=0.6),
# lmat=lmat,
# lwid = lwid,
# lhei = lhei,
# col=colorpanel(100, "black", "white"), labRow = F,labCol = F,
# ColSideColors=mycols[condition_plot], RowSideColors=mycols[condition_plot],
# margin=c(4, 4),
# main="Sample Distances Matrix - TAC Group")
# legend(x=-0.08,y=0.87,xpd=T, legend=unique(condition_plot), col=unique(mycols[condition_plot]),lty=1,lwd=5,cex=0.8)groups_to_plot <- c(25:36)
condition_plot <- condition[groups_to_plot]
# log.norm.counts <- log2(counts(ddsOK,normalized=T))
log.fpm <- log2(fpm(ddsOK))
corr_coeff.logfpm <- stats::cor(replace(log.fpm,is.infinite(log.fpm),NA),method="pearson",use="pairwise.complete.obs")
# corr_coeff.logfpm.complete <- stats::cor(replace(log.fpm,is.infinite(log.fpm),NA),method="pearson",use="complete.obs")
# corr_coeff.lognormcounts <- stats::cor(replace(log.norm.counts,is.infinite(log.fpm),NA),method="pearson",use="pairwise.complete.obs")
# corr_coeff.lognormcounts.complete <- stats::cor(replace(log.norm.counts,is.infinite(log.fpm),NA),method="pearson",use="complete.obs")
# plots
# as.dist(rescale(1-corr_coeff.logfpm[groups_to_plot,groups_to_plot]), upper = TRUE) %>%
# as.matrix %>%
# pheatmap::pheatmap(., main = "Pearson correlation")
heatmap.2(corr_coeff.logfpm[groups_to_plot,groups_to_plot], trace="none",
symkey=F,
key=T, key.title="Pearson Correlation",
keysize=2, density.info = "none", denscol=F, key.ylab='', key.xlab = "Pearson Correlation",
key.par=list(mgp=c(1.2, 0.5, 0),
mar=c(9, 3, 2.5, 3),
cex=0.6),
lmat=lmat,
lwid = lwid,
lhei = lhei,
col=rev(colorpanel(100, "black", "white")),
ColSideColors=mycols[condition_plot], RowSideColors=mycols[condition_plot],
margin=c(4, 4),
main="Sample correlations - TAC Group")
legend(x=-0.08,y=0.87,xpd=T, legend=unique(condition_plot), col=unique(mycols[condition_plot]),lty=1,lwd=5,cex=0.8)# heatmap.2(rescale(1-corr_coeff.logfpm.complete[groups_to_plot,groups_to_plot]), trace="none",
# symkey=F,
# key=T, key.title="Sample-sample distance",
# keysize=2, density.info = "none", denscol=F, key.ylab='', key.xlab = "Sample-sample distance",
# key.par=list(mgp=c(1.2, 0.5, 0),
# mar=c(9, 3, 2.5, 3),
# cex=0.6),
# lmat=lmat,
# lwid = lwid,
# lhei = lhei,
# col=colorpanel(100, "black", "white"),
# ColSideColors=mycols[condition_plot], RowSideColors=mycols[condition_plot],
# margin=c(4, 4),
# main="Sample Distances Matrix - TAC Group")
# legend(x=-0.08,y=0.87,xpd=T, legend=unique(condition_plot), col=unique(mycols[condition_plot]),lty=1,lwd=5,cex=0.8)
#
# heatmap.2(rescale(1-corr_coeff.lognormcounts[groups_to_plot,groups_to_plot]), trace="none",
# symkey=F,
# key=T, key.title="Sample-sample distance",
# keysize=2, density.info = "none", denscol=F, key.ylab='', key.xlab = "Sample-sample distance",
# key.par=list(mgp=c(1.2, 0.5, 0),
# mar=c(9, 3, 2.5, 3),
# cex=0.6),
# lmat=lmat,
# lwid = lwid,
# lhei = lhei,
# col=colorpanel(100, "black", "white"),
# ColSideColors=mycols[condition_plot], RowSideColors=mycols[condition_plot],
# margin=c(4, 4),
# main="Sample Distances Matrix - TAC Group")
# legend(x=-0.08,y=0.87,xpd=T, legend=unique(condition_plot), col=unique(mycols[condition_plot]),lty=1,lwd=5,cex=0.8)
#
# heatmap.2(rescale(1-corr_coeff.lognormcounts.complete[groups_to_plot,groups_to_plot]), trace="none",
# symkey=F,
# key=T, key.title="Sample-sample distance",
# keysize=2, density.info = "none", denscol=F, key.ylab='', key.xlab = "Sample-sample distance",
# key.par=list(mgp=c(1.2, 0.5, 0),
# mar=c(9, 3, 2.5, 3),
# cex=0.6),
# lmat=lmat,
# lwid = lwid,
# lhei = lhei,
# col=colorpanel(100, "black", "white"),
# ColSideColors=mycols[condition_plot], RowSideColors=mycols[condition_plot],
# margin=c(4, 4),
# main="Sample Distances Matrix - TAC Group")
# legend(x=-0.08,y=0.87,xpd=T, legend=unique(condition_plot), col=unique(mycols[condition_plot]),lty=1,lwd=5,cex=0.8)groups_to_plot <- c(17:24)
condition_plot <- condition[groups_to_plot]
heatmap.2(rescale(as.matrix(sampleDists_vst[groups_to_plot,groups_to_plot])), trace="none",
key=T, key.title="Sample-sample distance",
keysize=2, density.info = "none", denscol=F, key.ylab='', key.xlab = "Sample-sample distance",
key.par=list(mgp=c(1.2, 0.5, 0),
mar=c(9, 3, 2.5, 3),
cex=0.6),
lmat=lmat,
lwid = lwid,
lhei = lhei,
col=colorpanel(100, "black", "white"), labRow = F,labCol = F,
ColSideColors=mycols[condition_plot], RowSideColors=mycols[condition_plot],
margin=c(4, 4),
main="Sample Distances Matrix - Swim Group")
legend(x=-0.08,y=0.87,xpd=T, legend=unique(condition_plot), col=unique(mycols[condition_plot]),lty=1,lwd=5,cex=0.8)groups_to_plot <- c(1:8,13:16)
condition_plot <- condition[groups_to_plot]
heatmap.2(rescale(as.matrix(sampleDists_vst[groups_to_plot,groups_to_plot])), trace="none",
key=T, key.title="Sample-sample distance",
keysize=2, density.info = "none", denscol=F, key.ylab='', key.xlab = "Sample-sample distance",
key.par=list(mgp=c(1.2, 0.5, 0),
mar=c(9, 3, 2.5, 3),
cex=0.6),
lmat=lmat,
lwid = lwid,
lhei = lhei,
col=colorpanel(100, "black", "white"), labRow = F,labCol = F,
ColSideColors=mycols[condition_plot], RowSideColors=mycols[condition_plot],
margin=c(4, 4),
main="Sample Distances Matrix - PI3K Group")
legend(x=-0.08,y=0.87,xpd=T, legend=unique(condition_plot), col=unique(mycols[condition_plot]),lty=1,lwd=5,cex=0.8)groups_to_plot <- c(1:16)
condition_plot <- condition[groups_to_plot]
heatmap.2(rescale(as.matrix(sampleDists_vst[groups_to_plot,groups_to_plot])), trace="none",
key=T, key.title="Sample-sample distance",
keysize=2, density.info = "none", denscol=F, key.ylab='', key.xlab = "Sample-sample distance",
key.par=list(mgp=c(1.2, 0.5, 0),
mar=c(9, 3, 2.5, 3),
cex=0.6),
lmat=lmat,
lwid = lwid,
lhei = lhei,
col=colorpanel(100, "black", "white"), labRow = F,labCol = F,
ColSideColors=mycols[condition_plot], RowSideColors=mycols[condition_plot],
margin=c(4, 4),
main="Sample Distances Matrix - PI3K+IGF1R")
legend(x=-0.08,y=0.87,xpd=T, legend=unique(condition_plot), col=unique(mycols[condition_plot]),lty=1,lwd=5,cex=0.8)heatmap.2(rescale(as.matrix(sampleDists_vst[groups_to_plot,groups_to_plot])), trace="none",
key=T, key.title="Sample-sample distance",
keysize=2, density.info = "none", denscol=F, key.ylab='', key.xlab = "Sample-sample distance",
key.par=list(mgp=c(1.2, 0.5, 0),
mar=c(9, 3, 2.5, 3),
cex=0.6),
lmat=lmat,
lwid = lwid,
lhei = lhei,
col=colorpanel(100, "black", "white"),
ColSideColors=mycols[condition_plot], RowSideColors=mycols[condition_plot],
margin=c(4, 4),
main="Sample Distances Matrix - PI3K+IGF1R")
legend(x=-0.08,y=0.87,xpd=T, legend=unique(condition_plot), col=unique(mycols[condition_plot]),lty=1,lwd=5,cex=0.8)Get info from biomart and combine with the “genes” variable
library(biomaRt)
mart <- useMart("ensembl", dataset="mmusculus_gene_ensembl")
attributes <- c("ensembl_gene_id","gene_biotype","external_gene_name","entrezgene_id","mgi_id")
g <- getBM(attributes=attributes,filters="ensembl_gene_id",values=rownames(ddsOK),mart=mart,uniqueRows = T)
# g[which(g$ensembl_gene_id == "ENSMUSG00000113178"),]
# g[which(g$ensembl_gene_id == "ENSMUSG00000097383"),]
# g[which(g$entrezgene_id == 15455),]
# g[which(g$mgi_id == "MGI:96220"),]res.nc <- results(ddsOK)
res.nc$symbol <- mapIds(org.Mm.eg.db,
keys=row.names(res.nc),
column="SYMBOL",
keytype="ENSEMBL",
multiVals="first")## 'select()' returned 1:many mapping between keys and columns
topVarGenes <- head(order(rowVars(assay(vst.nc)), decreasing = TRUE), 20)
mat <- assay(vst.nc)[ topVarGenes, ]
mat <- mat - rowMeans(mat)
rownames(mat) <- res.nc$symbol[topVarGenes]
anno <- as.data.frame(colData(vst.nc)[, "condition"])
rownames(anno) <- colnames(mat)
colnames(anno) <- "Condition"
pheatmap(mat, annotation_col = anno)Plot all groups and also depending on the subset (e.g. Swim vs Non-swim)
plot_PCA <- function(my_vst){
z0 <- DESeq2::plotPCA(my_vst, intgroup="condition",ntop=2000) +
geom_point(size=3) +
stat_ellipse(type='t') +
scale_color_manual(values=mycols) +
guides(color = guide_legend(override.aes = list(linetype = 0))) +
theme_minimal() +
theme(legend.text = element_text(size = 10),
legend.key.size = unit(1, "lines"),
legend.title = element_blank(),
axis.text.x=element_text(size=rel(1.5)),
axis.title.x=element_text(size=rel(1.5)),
axis.text.y=element_text(size=rel(1.5)),
axis.title.y=element_text(size=rel(1.5)),
plot.margin = unit(c(0.3, 0.2, 0, 0.2), "cm"),
legend.position="bottom")
z1 <- DESeq2::plotPCA(my_vst[,c(1:16)], intgroup="condition",ntop=2000) +
geom_point(size=3) +
stat_ellipse(type='t') +
scale_color_manual(values=unique(mycols[my_vst[,c(1:16)]$condition])) +
theme_minimal() +
# geom_text(show.legend=F,aes(label = colnames(vst[,c(1:16)])),nudge_y=c(rep(1,8))) +
labs(color="Sample")+
guides(color = guide_legend(override.aes = list(linetype = 0))) +
theme(legend.text = element_text(size = 10),
legend.key.size = unit(1, "lines"),
legend.title = element_blank(),
axis.text.x=element_text(size=rel(1.5)),
axis.title.x=element_text(size=rel(1.5)),
axis.text.y=element_text(size=rel(1.5)),
axis.title.y=element_text(size=rel(1.5)),
plot.margin = unit(c(0.3, 0.2, 0, 0.2), "cm"),
legend.position="bottom")
z1n <- DESeq2::plotPCA(my_vst[,c(1:8,13:16)], intgroup="condition",ntop=2000) +
geom_point(size=3) +
stat_ellipse(type='t') +
scale_color_manual(values=unique(mycols[my_vst[,c(1:8,13:16)]$condition])) +
theme_minimal() +
# geom_text(show.legend=F,aes(label = colnames(vst[,c(1:16)])),nudge_y=c(rep(1,8))) +
labs(color="Sample")+
guides(color = guide_legend(override.aes = list(linetype = 0))) +
theme(legend.text = element_text(size = 10),
legend.key.size = unit(1, "lines"),
legend.title = element_blank(),
axis.text.x=element_text(size=rel(1.5)),
axis.title.x=element_text(size=rel(1.5)),
axis.text.y=element_text(size=rel(1.5)),
axis.title.y=element_text(size=rel(1.5)),
plot.margin = unit(c(0.3, 0.2, 0, 0.2), "cm"),
legend.position="bottom")
z2 <- DESeq2::plotPCA(my_vst[,c(17:24)], intgroup="condition",ntop=2000) +
geom_point(size=3) +
stat_ellipse(type='t') +
scale_color_manual(values=unique(mycols[my_vst[,c(17:24)]$condition])) +
theme_minimal() +
# geom_text(show.legend=F,aes(label = colnames(vst[,c(1:16)])),nudge_y=c(rep(1,8))) +
labs(color="Sample")+
guides(color = guide_legend(override.aes = list(linetype = 0))) +
theme(legend.text = element_text(size = 10),
legend.key.size = unit(1, "lines"),
legend.title = element_blank(),
axis.text.x=element_text(size=rel(1.5)),
axis.title.x=element_text(size=rel(1.5)),
axis.text.y=element_text(size=rel(1.5)),
axis.title.y=element_text(size=rel(1.5)),
plot.margin = unit(c(0.3, 0.2, 0, 0.2), "cm"),
legend.position="bottom")
z3 <- DESeq2::plotPCA(my_vst[,c(25:36)], intgroup="condition",ntop=2000) +
geom_point(size=3) +
stat_ellipse(type='t') +
scale_color_manual(values=unique(mycols[my_vst[,c(25:36)]$condition])) +
theme_minimal() +
# geom_text(show.legend=F,aes(label = colnames(vst[,c(1:16)])),nudge_y=c(rep(1,8))) +
labs(color="Sample")+
guides(color = guide_legend(override.aes = list(linetype = 0))) +
theme(legend.text = element_text(size = 10),
legend.key.size = unit(1, "lines"),
legend.title = element_blank(),
axis.text.x=element_text(size=rel(1.5)),
axis.title.x=element_text(size=rel(1.5)),
axis.text.y=element_text(size=rel(1.5)),
axis.title.y=element_text(size=rel(1.5)),
plot.margin = unit(c(0.3, 0.2, 0, 0.2), "cm"),
legend.position="bottom")
#all controls
z4 <- DESeq2::plotPCA(my_vst[,c(13:20,25:28)], intgroup="condition",ntop=2000) +
geom_point(size=3) +
stat_ellipse(type='t') +
scale_color_manual(values=unique(mycols[my_vst[,c(13:20,25:28)]$condition])) +
theme_minimal() +
# geom_text(show.legend=F,aes(label = colnames(vst[,c(1:16)])),nudge_y=c(rep(1,8))) +
labs(color="Sample")+
guides(color = guide_legend(override.aes = list(linetype = 0))) +
theme(legend.text = element_text(size = 10),
legend.key.size = unit(1, "lines"),
legend.title = element_blank(),
axis.text.x=element_text(size=rel(1.5)),
axis.title.x=element_text(size=rel(1.5)),
axis.text.y=element_text(size=rel(1.5)),
axis.title.y=element_text(size=rel(1.5)),
plot.margin = unit(c(0.3, 0.2, 0, 0.2), "cm"),
legend.position="bottom")
#swim vs igf1r vs tacsev
z5 <- DESeq2::plotPCA(my_vst[,c(9:12,21:24,33:36)], intgroup="condition",ntop=2000) +
geom_point(size=3) +
stat_ellipse(type='t') +
scale_color_manual(values=unique(mycols[my_vst[,c(9:12,21:24,33:36)]$condition])) +
theme_minimal() +
# geom_text(show.legend=F,aes(label = colnames(vst[,c(1:16)])),nudge_y=c(rep(1,8))) +
labs(color="Sample")+
guides(color = guide_legend(override.aes = list(linetype = 0))) +
theme(legend.text = element_text(size = 10),
legend.key.size = unit(1, "lines"),
legend.title = element_blank(),
axis.text.x=element_text(size=rel(1.5)),
axis.title.x=element_text(size=rel(1.5)),
axis.text.y=element_text(size=rel(1.5)),
axis.title.y=element_text(size=rel(1.5)),
plot.margin = unit(c(0.3, 0.2, 0, 0.2), "cm"),
legend.position="bottom")
# z2 + geom_text(show.legend=F,aes(label = colnames(vst[,c(17:24)])),nudge_y=c(rep(1,8))) + labs(color="Sample")
# plot_grid(z0,z1,z2,z3,z4,z5,ncol=3,nrow=2)
plot(z0)
plot(z1)
plot(z1n)
plot(z2)
plot(z3)
plot(z4)
plot(z5)
}DESeq2::plotPCA(vst[,c(1:16)], intgroup="condition",ntop=2000) +
geom_point(size=3) +
stat_ellipse(type='t') +
scale_color_manual(values=unique(mycols[vst[,c(1:16)]$condition])) +
# scale_color_manual(values=c(unique(mycols[vst[,c(1:12)]$condition]),"black")) +
theme_minimal() +
geom_text(show.legend=F,aes(label = colnames(vst[,c(1:16)])),colour=c(mycols[vst[,c(1:12)]$condition],rep("black",4)),
nudge_x=c(7,-7,8,8, 8,8,-7,-8, 7,10,0,10, 0,-10,-5,10),nudge_y=c(-3,4,0,0, 0,0,3,0, -2,0,5,0, -5,0,-5,0)) +
labs(color="Sample")+
guides(color = guide_legend(override.aes = list(linetype = 0))) +
theme(legend.text = element_text(size = 10),
legend.key.size = unit(1, "lines"),
legend.title = element_blank(),
axis.text.x=element_text(size=rel(1.5)),
axis.title.x=element_text(size=rel(1.5)),
axis.text.y=element_text(size=rel(1.5)),
axis.title.y=element_text(size=rel(1.5)),
plot.margin = unit(c(0.3, 0.2, 0, 0.2), "cm"),
legend.position="bottom")And the associated Screeplot
plot_PCA_screeplot <- function (my_vst) {
rv = rowVars(assay(my_vst))
selgenes=dim(my_vst)[1]
select = order(rv, decreasing = TRUE)[seq_len(min(selgenes, length(rv)))]
pca = prcomp(t(assay(my_vst)[select, ]))
## the contribution to the total variance for each component
percentVar <- pca$sdev^2 / sum( pca$sdev^2 )
##plot the "percentVar"
scree_plot=data.frame(percentVar)
scree_plot[,2]<- c(1:36)
colnames(scree_plot)<-c("variance","component_number")
p <- ggplot(scree_plot[1:5,], mapping=aes(x=component_number, y=variance))+geom_bar(stat="identity")+labs(x="PC")+ scale_x_continuous(breaks=c(1:5),labels=c(1:5)) +
geom_segment(aes(x=1,y=scree_plot$variance[1],xend=2,yend=sum(scree_plot$variance[1:2]))) +
geom_segment(aes(x=2,y=sum(scree_plot$variance[1:2]),xend=3,yend=sum(scree_plot$variance[1:3]))) +
geom_segment(aes(x=3,y=sum(scree_plot$variance[1:3]),xend=4,yend=sum(scree_plot$variance[1:4]))) +
geom_segment(aes(x=4,y=sum(scree_plot$variance[1:4]),xend=5,yend=sum(scree_plot$variance[1:5])))
return (p)
}Ideally, I will finish the differential analysis for all the different groups:
https://bioconductor.org/packages/release/workflows/vignettes/rnaseqGene/inst/doc/rnaseqGene.html
Explanation about how to call “results” and how to extract all different comparisons: https://support.bioconductor.org/p/98346/ https://rstudio-pubs-static.s3.amazonaws.com/329027_593046fb6d7a427da6b2c538caf601e1.html https://www.biostars.org/p/145211/
To see how an “easy case” works
##
## FALSE TRUE
## 14094 2376
##
## out of 16471 with nonzero total read count
## adjusted p-value < 0.1
## LFC > 0 (up) : 1271, 7.7%
## LFC < 0 (down) : 1809, 11%
## outliers [1] : 1, 0.0061%
## low counts [2] : 0, 0%
## (mean count < 10)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
Without anything, is the same than the first column vs the last one:
res <- results(ddsOK,contrast=c("condition","caPI3K","TACSEV"),alpha=0.05,lfcThreshold = 0)
table(res$padj<0.05)##
## FALSE TRUE
## 14094 2376
##
## out of 16471 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up) : 1455, 8.8%
## LFC < 0 (down) : 921, 5.6%
## outliers [1] : 1, 0.0061%
## low counts [2] : 0, 0%
## (mean count < 10)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
But we want to specify which columns to compare:
res <- results(ddsOK,contrast=c("condition","IGF1R","Ntg"),alpha=0.05,lfcThreshold = 0)
res$symbol <- mapIds(org.Mm.eg.db,
keys=row.names(res),
column="SYMBOL",
keytype="ENSEMBL",
multiVals="first")## 'select()' returned 1:many mapping between keys and columns
res$entrezid <- mapIds(org.Mm.eg.db,
keys=row.names(res),
column="ENTREZID",
keytype="ENSEMBL",
multiVals="first")## 'select()' returned 1:many mapping between keys and columns
res$refseq <- mapIds(org.Mm.eg.db,
keys=row.names(res),
column="REFSEQ",
keytype="ENSEMBL",
multiVals="first")## 'select()' returned 1:many mapping between keys and columns
##
## FALSE TRUE
## 11454 5016
Change colors from heatmap to highlight the ones you want to compare
# library(colorspace) ## hsv colorspace manipulations
# desat <- function(cols, sat=0.5) {
# X <- diag(c(1, sat, 1)) %*% rgb2hsv(col2rgb(cols))
# hsv(X[1,], X[2,], X[3,])
# }
# mycols <- hue_pal()(9)[1:length(unique(condition))]
mycols_desat <- desat(mycols,0.3)anno.neworder <- union(which(colData(vst)[, "condition"] %in% c("IGF1R","Ntg")),c(1:dim(colData(vst))[1]))
mat <- assay(vst)[ head(order(res$padj),20), anno.neworder]
mat <- mat - rowMeans(mat)
rownames(mat) <- res[match(rownames(mat),rownames(res)),"symbol"]
anno <- as.data.frame(colData(vst)[anno.neworder, "condition"])
rownames(anno) <- colnames(mat)
colnames(anno) <- "Condition"
anno_colors <- c(mycols[c(1,2)],mycols_desat[3:length(mycols_desat)])
names(anno_colors) <- unique(anno[,1])
anno_colors <- list(Condition=anno_colors)
pheatmap(mat, cluster_cols=F, annotation_col = anno, annotation_colors=anno_colors)## DataFrame with 9 rows and 2 columns
## type description
## <character> <character>
## baseMean intermediate mean of normalized counts for all samples
## log2FoldChange results log2 fold change (MLE): condition IGF1R vs Ntg
## lfcSE results standard error: condition IGF1R vs Ntg
## stat results Wald statistic: condition IGF1R vs Ntg
## pvalue results Wald test p-value: condition IGF1R vs Ntg
## padj results BH adjusted p-values
## symbol NA NA
## entrezid NA NA
## refseq NA NA
##
## out of 16471 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up) : 2252, 14%
## LFC < 0 (down) : 2764, 17%
## outliers [1] : 1, 0.0061%
## low counts [2] : 0, 0%
## (mean count < 10)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
topGenes <- rownames(res)[head(order(res$padj),10)]
topGenes <- res$symbol[head(order(res$padj),10)]
topGene <- topGenes[2]
data<-plotCounts(ddsOK, names(topGene), "condition",returnData=T)
# data$count <- log2(fpm(dds)[which(rownames(res) == topGene),])
data$count <- log2(fpm(ddsOK)[which(res$symbol == topGene),])
ggplot(data, aes(x=condition, y=count, color=condition))+
geom_boxplot(alpha=0,width=0.2,show.legend=FALSE) +
scale_color_manual(values=mycols) +
#scale_y_log10() +
geom_point(position=position_jitter(width=.1,height=0))+
labs(color="Group",x="Group",y="CPM (log2 transformed)") +
ggtitle(paste(topGene,": Gene Expression")) +
ylim(c(-10,15))## Warning: Removed 2 rows containing non-finite values (stat_boxplot).
topGene <- "Igf1r"
names(topGene) <- "ENSMUSG00000005533"
# topGene <- "1500026H17Rik"
# names(topGene) <- "ENSMUSG00000097383"
data<-plotCounts(ddsOK, names(topGene), "condition",returnData=T)
data$count <- log2(fpm(ddsOK)[which(res$symbol == topGene),])
ggplot(data, aes(x=condition, y=count, color=condition))+
geom_boxplot(alpha=0,width=0.2,show.legend=FALSE) +
scale_color_manual(values=mycols) +
#scale_y_log10() +
geom_point(position=position_jitter(width=.1,height=0))+
labs(color="Group",x="Group",y="CPM (log2 transformed)") +
ggtitle(paste(topGene,": Gene Expression")) +
ylim(c(-10,15))topGene <- "Chaer"
names(topGene) <- "ENSMUSG00000106783"
# topGene <- "1500026H17Rik"
# names(topGene) <- "ENSMUSG00000097383"
data<-plotCounts(ddsOK, "ENSMUSG00000005533", "condition",returnData=T)
data$count <- log2(cpm(fcounts.ensembl)[rownames(cpm(fcounts.ensembl) ) == names(topGene),])[c(11,22,30:36,1:10,12:21,23:29)]
ggplot(data, aes(x=condition, y=count, color=condition))+
geom_boxplot(alpha=0,width=0.2,show.legend=FALSE) +
scale_color_manual(values=mycols) +
#scale_y_log10() +
geom_point(position=position_jitter(width=.1,height=0))+
labs(color="Group",x="Group",y="CPM (log2 transformed)") +
ggtitle(paste(topGene,": Gene Expression")) +
ylim(c(-10,15))## Warning: Removed 3 rows containing non-finite values (stat_boxplot).
topGene <- "Chast"
names(topGene) <- "ENSMUSG00000085733"
# topGene <- "1500026H17Rik"
# names(topGene) <- "ENSMUSG00000097383"
data<-plotCounts(ddsOK, "ENSMUSG00000005533", "condition",returnData=T)
data$count <- log2(cpm(fcounts.ensembl)[rownames(cpm(fcounts.ensembl) ) == names(topGene),])[c(11,22,30:36,1:10,12:21,23:29)]
ggplot(data, aes(x=condition, y=count, color=condition))+
geom_boxplot(alpha=0,width=0.2,show.legend=FALSE) +
scale_color_manual(values=mycols) +
#scale_y_log10() +
geom_point(position=position_jitter(width=.1,height=0))+
labs(color="Group",x="Group",y="CPM (log2 transformed)") +
ggtitle(paste(topGene,": Gene Expression")) +
ylim(c(-10,15))Gets filtered out because not fpm vs IGFR very big
k <- DESeqDataSetFromMatrix(countData=countdata, colData=coldata, design=~condition)
sum(rownames(k) == "ENSMUSG00000097383")## [1] 1
## [1] 28462
## class: DESeqDataSet
## dim: 1 36
## metadata(1): version
## assays(1): counts
## rownames(1): ENSMUSG00000097383
## rowData names(0):
## colnames(36): JM10 JM11 ... JM8 JM9
## colData names(1): condition
## JM1 JM2 JM3 JM4 JM5 JM6 JM7 JM8 JM9 JM10 JM11 JM12 JM13 JM14 JM15 JM16
## 1 10 13 7 26 11 16 19 13 9 16 13 10 4 6 22
## JM17 JM18 JM19 JM20 JM21 JM22 JM23 JM24 JM25 JM26 JM27 JM28 JM29 JM30 JM31 JM32
## 17 16 7 5 17 17 15 5 13 3 5 13 9 5 1 19
## JM33 JM34 JM35 JM36
## 9 8 8 10
## JM1 JM2 JM3 JM4 JM5 JM6 JM7
## 0.03436150 0.22485331 0.27537107 0.15724851 0.48052464 0.22333618 0.26751923
## JM8 JM9 JM10 JM11 JM12 JM13 JM14
## 0.32200053 0.27070620 0.15351840 0.27607963 0.26895207 0.23579399 0.11205421
## JM15 JM16 JM17 JM18 JM19 JM20 JM21
## 0.11398658 0.37883915 0.41730730 0.41534225 0.16651119 0.16776974 0.47478136
## JM22 JM23 JM24 JM25 JM26 JM27 JM28
## 0.45363042 0.40765894 0.17513783 0.38008219 0.08886349 0.14916243 0.29686517
## JM29 JM30 JM31 JM32 JM33 JM34 JM35
## 0.27296094 0.12644871 0.03427961 0.34750087 0.25777708 0.22298386 0.16338074
## JM36
## 0.21170152
Now with IGFR1:
## class: DESeqDataSet
## dim: 1 36
## metadata(1): version
## assays(1): counts
## rownames(1): ENSMUSG00000005533
## rowData names(0):
## colnames(36): JM1 JM2 ... JM35 JM36
## colData names(1): condition
## JM1 JM2 JM3 JM4 JM5 JM6 JM7 JM8 JM9 JM10 JM11 JM12 JM13
## 937 814 674 716 1433 1340 2097 1932 58165 78311 69573 84361 1305
## JM14 JM15 JM16 JM17 JM18 JM19 JM20 JM21 JM22 JM23 JM24 JM25 JM26
## 731 871 1095 577 698 787 499 1181 1316 1288 1156 720 711
## JM27 JM28 JM29 JM30 JM31 JM32 JM33 JM34 JM35 JM36
## 771 872 760 820 808 934 1575 1215 1602 1236
## JM1 JM2 JM3 JM4 JM5 JM6 JM7
## 32.19673 18.30306 14.27693 16.08428 26.48430 27.20641 35.06174
## JM8 JM9 JM10 JM11 JM12 JM13 JM14
## 32.74237 1211.20200 1335.79768 1200.48050 1745.31275 30.77112 20.47791
## JM15 JM16 JM17 JM18 JM19 JM20 JM21
## 16.54705 18.85586 14.16390 18.11931 18.72062 16.74342 32.98334
## JM22 JM23 JM24 JM25 JM26 JM27 JM28
## 35.11633 35.00431 40.49187 21.05071 21.06065 23.00085 19.91280
## JM29 JM30 JM31 JM32 JM33 JM34 JM35
## 23.05004 20.73759 27.69793 17.08241 45.11099 33.86567 32.71699
## JM36
## 26.16631
# original volcanoplot function
volcanoplot <- function (res, lfcthresh=1, sigthresh=0.05, main="Volcano Plot", legendpos="bottomright", labelsig=TRUE,textcx=1, ...) {
with(res, plot(log2FoldChange, -log10(pvalue), pch=20, main=main, ...))
with(subset(res, padj<sigthresh ), points(log2FoldChange, -log10(pvalue), pch=20, col="red", ...))
with(subset(res, abs(log2FoldChange)>lfcthresh), points(log2FoldChange, -log10(pvalue), pch=20, col="orange", ...))
with(subset(res, padj<sigthresh & abs(log2FoldChange)>lfcthresh), points(log2FoldChange, -log10(pvalue), pch=20, col="green", ...))
if (labelsig) {
require(calibrate)
with(subset(res, padj<sigthresh & abs(log2FoldChange)>lfcthresh), textxy(log2FoldChange, -log10(pvalue), labs=Gene, cex=textcx, ...))
}
legend(legendpos, xjust=1, yjust=1, legend=c(paste("FDR<",sigthresh,sep=""), paste("|LogFC|>",lfcthresh, sep=""), "both"), pch=20, col=c("red","orange","green"))
}
# fixed version to only include FDR
volcanoplot <- function (res, lfcthresh=1, sigthresh=0.05, main="Volcano Plot", legendpos="bottomright", labelymin, labelxmin, labelsig=TRUE, textcx=1, ...) {
with(res, plot(log2FoldChange, -log10(pvalue), pch=20, main=main, ...))
with(subset(res, padj<sigthresh ), points(log2FoldChange, -log10(pvalue), pch=20, col="red", ...))
# with(subset(res, abs(log2FoldChange)>lfcthresh), points(log2FoldChange, -log10(pvalue), pch=20, col="orange", ...))
# with(subset(res, padj<sigthresh & abs(log2FoldChange)>lfcthresh), points(log2FoldChange, -log10(pvalue), pch=20, col="green", ...))
if (labelsig) {
require(calibrate)
# with(subset(res, padj<sigthresh & abs(log2FoldChange)>lfcthresh), textxy(log2FoldChange, -log10(pvalue), labs=Gene, cex=textcx, ...))
with(subset(res, -log10(pvalue)>labelymin | abs(log2FoldChange)>labelxmin), textxy(log2FoldChange-(0.5*(log2FoldChange/abs(log2FoldChange))), -log10(pvalue), labs=Gene, cex=textcx, ...))
}
legend(legendpos, xjust=1, yjust=1, legend=c(paste("FDR < ",sigthresh,sep="")), pch=20, col=c("red"),cex=0.8)
}resorder <- res[order(res$padj), ]
## Merge with normalized count data
resdata <- merge(as.data.frame(resorder), as.data.frame(counts(ddsOK, normalized=TRUE)), by="row.names", sort=FALSE)
names(resdata)[1] <- "Gene"
resdata$GeneENS <- resdata$Gene
resdata$Gene <- res$symbol[match(resdata$Gene,rownames(res))]
resdata$Gene <- ifelse(is.na(resdata$Gene),resdata$GeneENS,resdata$Gene)
# volcanoplot(resdata, lfcthresh=1, sigthresh=0.05, textcx=.8, xlim=c(-10, 10),ylim=c(0,30))
volcanoplot(resdata, labelymin = 40, labelxmin = 7, lfcthresh=1, sigthresh=0.05, textcx=.8, xlim=c(-15, 15))## Loading required package: calibrate
## Loading required package: MASS
##
## Attaching package: 'MASS'
## The following object is masked from 'package:biomaRt':
##
## select
## The following object is masked from 'package:dplyr':
##
## select
## The following object is masked from 'package:AnnotationDbi':
##
## select
Change variables so I don’t have to change everything below… nc variables now point to Scot’s corrected variables.
library(dplyr)
res.nc <- results(dds.nc,contrast=c("condition","IGF1R","Ntg"),alpha=0.05,lfcThreshold = 0)
res.nc$symbol <- mapIds(org.Mm.eg.db,
keys=row.names(res.nc),
column="SYMBOL",
keytype="ENSEMBL",
multiVals="first")## 'select()' returned 1:many mapping between keys and columns
res.nc$symbol <- coalesce(res.nc$symbol,g$external_gene_name[match(rownames(res.nc),g$ensembl_gene_id,)])
res.nc$entrezid <- mapIds(org.Mm.eg.db,
keys=row.names(res.nc),
column="ENTREZID",
keytype="ENSEMBL",
multiVals="first")## 'select()' returned 1:many mapping between keys and columns
res.nc$refseq <- mapIds(org.Mm.eg.db,
keys=row.names(res.nc),
column="REFSEQ",
keytype="ENSEMBL",
multiVals="first")## 'select()' returned 1:many mapping between keys and columns
##
## FALSE TRUE
## 11454 5016
##
## out of 16471 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up) : 2252, 14%
## LFC < 0 (down) : 2764, 17%
## outliers [1] : 1, 0.0061%
## low counts [2] : 0, 0%
## (mean count < 10)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
anno.neworder <- union(which(colData(vst.nc)[, "condition"] %in% c("IGF1R","Ntg")),c(1:dim(colData(vst.nc))[1]))
mat <- assay(vst.nc)[ head(order(res.nc$padj),20), anno.neworder]
mat <- mat - rowMeans(mat)
rownames(mat) <- res.nc[match(rownames(mat),rownames(res.nc)),"symbol"]
anno <- as.data.frame(colData(vst.nc)[anno.neworder, "condition"])
rownames(anno) <- colnames(mat)
colnames(anno) <- "Condition"
anno_colors <- c(hue_pal()(2),rev(mycols_desat[3:length(mycols_desat)]))
names(anno_colors) <- unique(anno[,1])
anno_colors <- list(Condition=anno_colors)
pheatmap(mat, cluster_cols=F, annotation_col = anno, annotation_colors=anno_colors)topGenes <- rownames(res.nc)[head(order(res.nc$padj),10)]
topGenes <- res.nc$symbol[head(order(res.nc$padj),10)]
topGene <- topGenes[1]
data<-plotCounts(dds.nc, names(topGene), "condition",returnData=T)
data$count <- log2(fpm(dds.nc)[which(res.nc$symbol == topGene),])
ggplot(data, aes(x=condition, y=count, color=condition))+
geom_boxplot(alpha=0,width=0.2,show.legend=FALSE) +
scale_color_manual(values=mycols) +
#scale_y_log10() +
geom_point(position=position_jitter(width=.1,height=0))+
labs(color="Group",x="Group",y="CPM (log2 transformed)") +
ggtitle(paste(topGene,": Gene Expression")) +
ylim(c(-10,15))resorder.nc <- res.nc[order(res.nc$padj), ]
## Merge with normalized count data
resdata.nc <- merge(as.data.frame(resorder.nc), as.data.frame(counts(dds.nc, normalized=TRUE)), by="row.names", sort=FALSE)
names(resdata.nc)[1] <- "Gene"
resdata.nc$GeneENS <- resdata.nc$Gene
resdata.nc$Gene <- res.nc$symbol[match(resdata.nc$Gene,rownames(res.nc))]
# volcanoplot(resdata, lfcthresh=1, sigthresh=0.05, textcx=.8, xlim=c(-10, 10),ylim=c(0,30))
volcanoplot(resdata.nc, labelymin = 40, labelxmin=5,lfcthresh=1, sigthresh=0.05, textcx=.8, xlim=c(-10, 10))Which genes are different between the mm10 gtf set and the one with noncode?
## character(0)
# summarizeProteinCodingGenes <- function(txdb)
# {
# stopifnot(is(txdb, "TxDb"))
# protein_coding_tx <- names(cdsBy(txdb, use.names=TRUE))
# all_tx <- mcols(transcripts(txdb, columns=c("gene_id", "tx_name")))
# all_tx$gene_id <- as.character(all_tx$gene_id)
# all_tx$is_coding <- all_tx$tx_name %in% protein_coding_tx
# tmp <- splitAsList(all_tx$is_coding, all_tx$gene_id)
# gene <- names(tmp)
# nb_tx <- unname(elementNROWS(tmp))
# nb_coding <- unname(sum(tmp))
# nb_non_coding <- nb_tx - nb_coding
# data.frame(gene, nb_tx, nb_coding, nb_non_coding, stringsAsFactors=FALSE)
# }# library(TxDb.Mmusculus.UCSC.mm10.knownGene)
# txdb <- TxDb.Mmusculus.UCSC.mm10.knownGene
# df <- summarizeProteinCodingGenes(txdb)
# head(df)# df$symbol <- mapIds(org.Mm.eg.db,
# keys=df$gene,
# column="SYMBOL",
# keytype="ENTREZID",
# multiVals="first")
# df$ensembl <- mapIds(org.Mm.eg.db,
# keys=df$gene,
# column="ENSEMBL",
# keytype="ENTREZID",
# multiVals="first")
# df[which( df$symbol == "Pik3ca"),]
# df [which(df$gene == 13221),]
# df[which(df$ensembl == "ENSMUSG00000097383"),]
# df[which(df$ensembl == "ENSMUSG00000113178"),]
# head(df)
# dim(df)other options, some errors with the one above: https://support.bioconductor.org/p/63502/ https://www.biostars.org/p/178726/
genes<-read.table("gene_info/geneInfo.txt",sep="\t",quote="\"",na.strings="-",fill=TRUE,col.names=c("GeneID","Symbol","TypeOfGene"))
# dim(genes)genes$ensembl <- mapIds(org.Mm.eg.db,
keys=as.character(genes$GeneID),
column="ENSEMBL",
keytype="ENTREZID",
multiVals="first")## 'select()' returned 1:many mapping between keys and columns
genes$ensemblSYM <- mapIds(org.Mm.eg.db,
keys=as.character(genes$Symbol),
column="ENSEMBL",
keytype="SYMBOL",
multiVals="first")## 'select()' returned 1:many mapping between keys and columns
# head(genes)
# genes[which(genes$GeneID== 13221),]
# genes[which(genes$GeneENS == "ENSMUSG00000094497"),]genes$ensemblMART <- g$ensembl_gene_id[match(genes$Symbol,g$external_gene_name,)]
# genes[which(genes$GeneID== 17902),]
# length(genes$ensembl)
# sum(is.na(genes$ensembl))
genes <- genes %>%
mutate(GeneENS = coalesce(ensembl,ensemblSYM, ensemblMART)) %>%
dplyr::select(GeneID, Symbol, TypeOfGene,GeneENS)
# genes %>%
# group_by(TypeOfGene) %>%
# summarize(k=sum(!is.na(GeneENS))/length(is.na(GeneENS)))# bplot$source <- "ensembl"
# bplot.nc$source <- "noncode"
reads.summary <- bplot
rownames(reads.summary) <- NULL
reads.summary <- cbind(reads.summary,read.table("../STAR_ALIGNMENT/summary_aligned_reads.txt"))
reads.summary$V3 <- as.numeric(sub("%","",reads.summary$V3))
# get mrna and noncode counts
ens.data <- as.data.frame(dgList$genes$GeneID)
names(ens.data) <- 'GeneENS'
ens.data$biotype <- genes$TypeOfGene[match(ens.data$GeneENS,genes$GeneENS)]
ens.mask.proteincoding <- ens.data %>% filter(biotype == 'protein-coding')
noncode.data <- as.data.frame(dgList.noncode$genes$GeneID)
names(noncode.data) <- 'GeneENS'
noncode.data$biotype <- genes$TypeOfGene[match(noncode.data$GeneENS,genes$GeneENS)]
noncode.data[str_detect(noncode.data$GeneENS,"NONMMU"),"biotype"] <- "ncRNA"
noncode.mask.proteincoding <- noncode.data %>% filter(biotype == 'protein-coding')
noncode.mask.ncrna <- noncode.data %>% filter(biotype == 'ncRNA')
# add count info
ensembl.counts <- colSums(cpm(dgList[dgList$genes$GeneID %in% ens.mask.proteincoding$GeneENS,]) > 1)
noncode.counts <- colSums(cpm(dgList.noncode[dgList.noncode$genes$GeneID %in% noncode.mask.ncrna$GeneENS,]) > 1)
reads.summary$ensembl.counts <- ensembl.counts[match(reads.summary$ids,names(ensembl.counts))]
reads.summary$noncode.counts <- noncode.counts[match(reads.summary$ids,names(noncode.counts))]
reads.summary <- reads.summary %>%
dplyr::select(group,ids, V1,V2,V3,ensembl.counts,noncode.counts) %>%
rename(sample=group,input.reads=V1, unique.reads=V2, aligned.reads.perc=V3) %>%
group_by(sample) %>%
summarize(input_reads=mean(input.reads),
unique_aligned_reads=mean(unique.reads),
perc_aligned_reads=round(mean(aligned.reads.perc),2),
counts.mrna=mean(ensembl.counts),
counts.ncrna=mean(noncode.counts)) %>%
ungroup()
write.csv(reads.summary,file="csv/reads_counts_summary.csv")# t <- genes
# t[which(t$GeneID== 17902),]
# sum(is.na(t$GeneENS))
# t[which(t$GeneENS == "ENSMUSG00000113178"),]
# t[which(t$GeneENS == "ENSMUSG00000097383"),]
# t[which(t$GeneID == 69002),]
# t[which(t$Symbol == "Igf1r"),]
# tprotcod <- t[which(t$TypeOfGene == "protein-coding"),]
# tprotcod[is.na(tprotcod$GeneENS),]resdata.nc$biotype <- genes$TypeOfGene[match(resdata.nc$entrezid,genes$GeneID)]
resdata.nc$biotype <- coalesce(resdata.nc$biotype,genes$TypeOfGene[match(resdata.nc$GeneENS,genes$GeneENS)])
resdata.nc[str_detect(resdata.nc$GeneENS,"NONMMU"),"biotype"] <- "ncRNA"
# fix Gene and symbol columns to take NONMMU column
resdata.nc[str_detect(resdata.nc$GeneENS,"NONMMU"),"symbol"] <- resdata.nc[str_detect(resdata.nc$GeneENS,"NONMMU"),"GeneENS"]
resdata.nc[str_detect(resdata.nc$GeneENS,"NONMMU"),"Gene"] <- resdata.nc[str_detect(resdata.nc$GeneENS,"NONMMU"),"GeneENS"]
# remove genes with more than 3 NAs per row, as it means they are manual genes (no symbol, entrez or ensembl codes) (~30)
resdata.nc <- resdata.nc[rowSums(is.na(resdata.nc)) <= 3,]
# remove genes with biotype == NA, as they don't exist on ensembl (normally pseudogenes or manual entries too) (~500 out of 16000 --> 3%)
resdata.nc <- resdata.nc[!is.na(resdata.nc$biotype),]
# colSums(is.na(resdata.nc))
# resdata.nc %>% filter(is.na(biotype))
# genes %>% filter(GeneENS == 'ENSMUSG00000090673')
# head(resdata.nc)
# resdata.nc %>% mutate(biotype = forcats::fct_explicit_na(biotype)) %>% group_by(biotype) %>% summarize(k=length(is.na(Gene)))case <- "IGF1R"
biotype_colors <-c(mycols[unique(condition[condition == case])],desat(mycols[unique(condition[condition == case])],0.5))
plot_resdata.nc <- filter(resdata.nc,padj < 0.05)
plot_resdata.nc$expression <- "up"
plot_resdata.nc$expression[plot_resdata.nc$log2FoldChange < 0] <- "down"
plot_resdata.nc$expression <- factor(plot_resdata.nc$expression,levels=c("up","down"))
# df3 <- data_summary(plot_data, varname="DM", groupnames="group")
pTitle <- "Biotype and expression Barplot"
pxLab <- "Biotypes"
pyLab <- "Count"
ggplot(plot_resdata.nc,aes(biotype,fill=expression)) +
geom_bar(stat="count", width=0.8, aes(fill=expression),position=position_dodge(-.9)) +
scale_fill_manual(values=biotype_colors) +
ggtitle(pTitle) +
xlab (pxLab) +
ylab (pyLab) +
geom_text(stat='count', aes(label=..count..), hjust=-0.15,position = position_dodge(-0.9)) +
ylim(c(0,2500)) +
coord_flip() +
theme_minimal()# k <- subset(resdata,biotype=='protein-coding')
# knc <- subset(resdata.nc,biotype=='protein-coding')
# k <- subset(resdata,biotype=='ncRNA')
# knc <- subset(resdata.nc,biotype=='ncRNA')
# k <- k[k$padj < 0.05,]
# knc <- knc[knc$padj < 0.05,]
# length(knc$GeneENS %in% k$GeneENS)
# sum(knc$GeneENS %in% k$GeneENS)https://www.r-graph-gallery.com/spider-or-radar-chart.html
library(fmsb)
data <- matrix(table(plot_resdata.nc$biotype,exclude=NULL),nrow=1)
colnames(data) <- names(table(plot_resdata.nc$biotype,exclude=NULL))
data <-t(as.matrix(data[,data > 1]))
data <- as.data.frame(rbind(rep(4000,length(data)) , rep(0,length(data)) , data))
colnames(data)[is.na(colnames(data))] = "NA"
radarchart(data,axistype=1,
pcol=rgb(0.2,0.5,0.5,0.9) , pfcol=rgb(0.2,0.5,0.5,0.5) , plwd=4 ,
#custom the grid
cglcol="grey", cglty=1, axislabcol="grey", caxislabels=seq(0,4000,1000), cglwd=0.8,
#custom labels
vlcex=0.8
)resLNC <- resdata.nc %>%
# filter(biotype != "protein-coding" & !is.na(biotype))
filter(biotype == "ncRNA")
table(resLNC$padj<0.05)##
## FALSE TRUE
## 2345 415
# kknoncode <- dds.nc[str_detect(rownames(dds.nc),"NONMMU"),]
# reskk <- results(kknoncode,contrast=c("condition","IGF1R","Ntg"),alpha=0.05,lfcThreshold = 0)
# table(reskk$padj<0.05)topGenes <- resLNC$GeneENS[head(order(resLNC$padj),10)]
# topGenes <- res$symbol[head(order(res$padj),10)]
topGene <- topGenes[3]
data<-plotCounts(dds.nc, topGene, "condition",returnData=T)
data$count <- log2(fpm(dds.nc)[which(rownames(res.nc) == topGene),])
# data$count <- log2(fpm(dds.nc)[which(res$symbol == topGene),])
ggplot(data, aes(x=condition, y=count, color=condition))+
geom_boxplot(alpha=0,width=0.2,show.legend=FALSE) +
scale_color_manual(values=mycols) +
#scale_y_log10() +
geom_point(position=position_jitter(width=.1,height=0))+
labs(color="Group",x="Group",y="CPM (log2 transformed)") +
ggtitle(paste(topGene,": Gene Expression")) +
ylim(c(-5,10))run_all_process <- function (case,control,gene_matrix=genes,volcano_x,volcano_y){
myres <- results(dds.nc,contrast=c("condition",case,control),alpha=0.05,lfcThreshold = 0)
myres$symbol <- mapIds(org.Mm.eg.db,
keys=row.names(myres),
column="SYMBOL",
keytype="ENSEMBL",
multiVals="first")
myres$symbol <- coalesce(myres$symbol,g$external_gene_name[match(rownames(myres),g$ensembl_gene_id,)])
myres$entrezid <- mapIds(org.Mm.eg.db,
keys=row.names(myres),
column="ENTREZID",
keytype="ENSEMBL",
multiVals="first")
myres$refseq <- mapIds(org.Mm.eg.db,
keys=row.names(myres),
column="REFSEQ",
keytype="ENSEMBL",
multiVals="first")
print(table(myres$padj < 0.05))
#PLOT HEATMAP
anno.neworder <- union(which(colData(vst.nc)[, "condition"] %in%
c(case,control)),c(1:dim(colData(vst.nc))[1]))
mat <- assay(vst.nc)[ head(order(myres$padj),20), anno.neworder]
mat <- mat - rowMeans(mat)
rownames(mat) <- myres[match(rownames(mat),rownames(myres)),"symbol"]
anno <- as.data.frame(colData(vst.nc)[anno.neworder, "condition"])
rownames(anno) <- colnames(mat)
colnames(anno) <- "Condition"
anno_colors <- c(hue_pal()(2),rev(mycols_desat[3:length(mycols_desat)]))
names(anno_colors) <- unique(anno[,1])
anno_colors <- list(Condition=anno_colors)
pheatmap(mat, cluster_cols=F, annotation_col = anno,
annotation_colors=anno_colors)
#PLOT BOXPLOT MAX DIFF
topGenes <- rownames(myres)[head(order(myres$padj),10)]
topGenes <- myres$symbol[head(order(myres$padj),10)]
topGene <- topGenes[!is.na(myres$symbol[head(order(myres$padj),10)])][1]
data<-plotCounts(dds.nc, names(topGene), "condition",returnData=T)
data$count <- log2(fpm(dds.nc)[which(myres$symbol == topGene),])
p <- ggplot(data, aes(x=condition, y=count, color=condition))+
geom_boxplot(alpha=0,width=0.2,show.legend=FALSE) +
#scale_y_log10() +
scale_color_manual(values=mycols) +
geom_point(position=position_jitter(width=.1,height=0))+
labs(color="Group",x="Group",y="CPM (log2 transformed)") +
ggtitle(paste(topGene,": Gene Expression")) +
ylim(c(-10,15)) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 18,hjust=1),
plot.margin=margin(t = 5, r = 2, b = 5, l = 30, unit = "pt"))
plot(p)
#ORDER AND GET DATA
resorder.nc <- myres[order(myres$padj), ]
resdata.nc <- merge(as.data.frame(resorder.nc),
as.data.frame(log2(fpm(dds.nc))),
by="row.names", sort=FALSE)
names(resdata.nc)[1] <- "Gene"
resdata.nc$GeneENS <- resdata.nc$Gene
resdata.nc$Gene <- myres$symbol[match(resdata.nc$Gene,rownames(myres))]
resdata.nc$Gene <- ifelse(is.na(resdata.nc$Gene),resdata.nc$GeneENS,resdata.nc$Gene)
# VOLCANO PLOT
volcanoplot(resdata.nc, lfcthresh=1, labelymin = volcano_y, labelxmin=volcano_x, sigthresh=0.05, textcx=.8, xlim=c(-15, 15))
genes <- gene_matrix
resdata.nc$biotype <- genes$TypeOfGene[match(resdata.nc$entrezid,genes$GeneID)]
resdata.nc$biotype <- coalesce(resdata.nc$biotype,genes$TypeOfGene[match(resdata.nc$GeneENS,genes$GeneENS)])
resdata.nc[str_detect(resdata.nc$GeneENS,"NONMMU"),"biotype"] <- "ncRNA"
resdata.nc[str_detect(resdata.nc$GeneENS,"NONMMU"),"symbol"] <- resdata.nc[str_detect(resdata.nc$GeneENS,"NONMMU"),"GeneENS"]
resdata.nc[str_detect(resdata.nc$GeneENS,"NONMMU"),"Gene"] <- resdata.nc[str_detect(resdata.nc$GeneENS,"NONMMU"),"GeneENS"]
resdata.nc <- resdata.nc[rowSums(is.na(resdata.nc)) <= 3,]
resdata.nc <- resdata.nc[!is.na(resdata.nc$biotype),]
# biotype comparison
biotype_colors <-c(mycols[unique(condition[condition == case])],adjustcolor(mycols[unique(condition[condition == case])],alpha.f=0.5))
plot_resdata.nc <- filter(resdata.nc,padj < 0.05)
plot_resdata.nc$expression <- "up"
plot_resdata.nc$expression[plot_resdata.nc$log2FoldChange < 0] <- "down"
plot_resdata.nc$expression <- factor(plot_resdata.nc$expression,levels=c("up","down"))
pTitle <- paste0("Biotype and expression - ",case," vs ",control)
pxLab <- "Biotypes"
pyLab <- "Count"
ylimmax <- round(max(table(plot_resdata.nc$expression))*1.2/5000,1)*5000
if (ylimmax == 0) {ylimmax <- 500}
p <- ggplot(plot_resdata.nc,aes(biotype,fill=expression)) +
geom_bar(stat="count", width=0.8,
aes(fill=expression),position=position_dodge(-.9)) +
ggtitle(pTitle) +
scale_fill_manual(values=biotype_colors) +
xlab (pxLab) +
ylab (pyLab) +
geom_text(stat='count', aes(label=..count..), hjust=-0.15,
position = position_dodge(-0.9)) +
ylim(c(0,ylimmax)) +
coord_flip() +
theme_minimal()
plot(p)
#PLOT RADARCHART
data <- matrix(table(plot_resdata.nc$biotype,exclude=NULL),nrow=1)
colnames(data) <- names(table(plot_resdata.nc$biotype,exclude=NULL))
data <-t(as.matrix(data[,data > 1]))
data <- as.data.frame(rbind(rep(4000,length(data)) , rep(0,length(data)) , data))
colnames(data)[is.na(colnames(data))] = "NA"
# radarchart(data,axistype=1,
# pcol=rgb(0.2,0.5,0.5,0.9) , pfcol=rgb(0.2,0.5,0.5,0.5) , plwd=4 ,
# #custom the grid
# cglcol="grey", cglty=1, axislabcol="grey", caxislabels=seq(0,4000,1000), cglwd=0.8,
# #custom labels
# vlcex=0.8)
# return(plot_resdata.nc)
#SAVE CSV
write.csv(plot_resdata.nc,paste0("csv/raw_DE_genes_LOG2CPM_STATS_",case,"_vs_",control,".csv"))
write.csv(resdata.nc,paste0("csv/NOFILTER_raw_DE_genes_LOG2CPM_STATS_",case,"_vs_",control,".csv"))
}Prepare input, get gene ids from ensembl
library(biomaRt)
mouse = useMart("ensembl", dataset = "mmusculus_gene_ensembl")
biom.genes <- getBM(attributes=c("ensembl_gene_id","entrezgene_id","external_gene_name","chromosome_name","start_position","end_position","description","transcript_length"),mart = mouse)
resdata.nc$entrez<-biom.genes[match(resdata.nc$GeneENS,biom.genes$ensembl_gene_id),]$entrezgene_id
resdata.nc$gene_name<-biom.genes[match(resdata.nc$GeneENS,biom.genes$ensembl_gene_id),]$external_gene_name
resdata.nc$length<-biom.genes[match(resdata.nc$GeneENS,biom.genes$ensembl_gene_id),]$transcript_lengthGet all pathways from msigdb
library(msigdbr)
raw.mouse.paths.HALLMARK = msigdbr(species = "Mus musculus",category="H")
raw.mouse.paths.HALLMARK.names <- unique(raw.mouse.paths.HALLMARK$gs_name)
pathways.MM.H <- list()
for (i in raw.mouse.paths.HALLMARK.names){
tmp <- as.list(raw.mouse.paths.HALLMARK[raw.mouse.paths.HALLMARK$gs_name == i,"entrez_gene"])
pathways.MM.H[i] <- tmp
}
pathways.MM.H <- lapply(pathways.MM.H, as.character)## Loading required package: Rcpp
gseaDat <- resdata.nc[!is.na(resdata.nc$entrezid),]
# ranks <- gseaDat$log2FoldChange
ranks <- gseaDat$stat
names(ranks) <- gseaDat$entrez
fgseaRes <- fgsea(pathways.MM.H, ranks, minSize=15, maxSize = 500, nperm=1000)## Warning in fgsea(pathways.MM.H, ranks, minSize = 15, maxSize = 500, nperm =
## 1000): There are duplicate gene names, fgsea may produce unexpected results
ggplot(fgseaRes[order(padj, -abs(NES)), ][1:20], aes(reorder(pathway, NES), NES)) +
geom_col(aes(fill=padj<0.05)) +
coord_flip() +
labs(x="Pathway", y="Normalized Enrichment Score",
title="Hallmark pathways NES from GSEA") +
theme_minimal()Read results and store them
## log2 fold change (MLE): condition IGF1R vs Ntg
## Wald test p-value: condition IGF1R vs Ntg
## DataFrame with 16471 rows and 9 columns
## baseMean log2FoldChange lfcSE
## <numeric> <numeric> <numeric>
## ENSMUSG00000025900 39.1510882162175 -1.8648198076192 0.47961276878843
## ENSMUSG00000025902 214.170233947549 -0.27154097468351 0.157119761191672
## ENSMUSG00000103922 557.040660991107 -0.319471146506223 0.107966298334085
## ENSMUSG00000033845 4012.1840794926 -0.652617521741171 0.135794818224857
## ENSMUSG00000025903 766.46190214414 0.412432479724492 0.370149620473125
## ... ... ... ...
## NONMMUG096503.1 129.233893235281 0.268550017225519 0.386939700729404
## NONMMUG096512.1 87.2020704401358 -0.193228930867987 0.405147578248967
## NONMMUG096518.1 45.324632203208 -0.931174158131292 0.372790647613012
## NONMMUG096562.1 83.6755177806055 0.188858733460884 0.362402049494496
## NONMMUG096601.1 122.695011992963 -0.207239049814096 0.440647632796569
## stat pvalue padj
## <numeric> <numeric> <numeric>
## ENSMUSG00000025900 -3.88817798227099 0.000100999555284196 0.00114262425048475
## ENSMUSG00000025902 -1.72824202776285 0.0839448417110097 0.176191097614417
## ENSMUSG00000103922 -2.9589895313227 0.00308649586337376 0.0149733687392535
## ENSMUSG00000033845 -4.80590887246174 1.54050059396945e-06 4.81563938429422e-05
## ENSMUSG00000025903 1.11423180495855 0.265179754269177 0.396745885732066
## ... ... ... ...
## NONMMUG096503.1 0.694035832247986 0.487659737900373 0.610053331815286
## NONMMUG096512.1 -0.476934680698612 0.633408635357108 0.730497880003611
## NONMMUG096518.1 -2.49784742212184 0.0124949960689898 0.0432974090587549
## NONMMUG096562.1 0.521130423308361 0.602275918666113 0.70521003699921
## NONMMUG096601.1 -0.4703056010964 0.638136696345797 0.733638935419187
## symbol entrezid refseq
## <character> <character> <character>
## ENSMUSG00000025900 Rp1 19888 NM_001195662
## ENSMUSG00000025902 Sox17 20671 NM_001289464
## ENSMUSG00000103922 Gm6123 NA NA
## ENSMUSG00000033845 Mrpl15 27395 NM_001177658
## ENSMUSG00000025903 Lypla1 18777 NM_001355712
## ... ... ... ...
## NONMMUG096503.1 NA NA NA
## NONMMUG096512.1 NA NA NA
## NONMMUG096518.1 NA NA NA
## NONMMUG096562.1 NA NA NA
## NONMMUG096601.1 NA NA NA
Find FEELnc pairs in DE set
feelnc.in.myres <- feelnc[which(feelnc$partnerRNA_gene %in% resdata.nc$GeneENS),]
feelnc.pairs <- feelnc.in.myres[which(feelnc.in.myres$lncRNA_gene %in% resdata.nc$GeneENS),]
head(feelnc.pairs)check pair 1
Repeat the same but filtering first for DE genes in resdata.nc
resdata.nc.DE <- resdata.nc[resdata.nc$padj < 0.05,]
resdata.nc.DE <- resdata.nc.DE[!is.na(resdata.nc.DE$padj),]Get pairs from feelnc present in DE expressed genes
feelnc.in.myres <- feelnc[which(feelnc$partnerRNA_gene %in% resdata.nc.DE$GeneENS),]
feelnc.pairs <- feelnc.in.myres[which(feelnc.in.myres$lncRNA_gene %in% resdata.nc.DE$GeneENS),]
feelnc.pairs <- feelnc.pairs[feelnc.pairs$isBest == 1,]
feelnc.pairs$partnerRNA_symbol <- genes$Symbol[match(feelnc.pairs$partnerRNA_gene,genes$GeneENS)]
feelnc.pairs <- feelnc.pairs[,c(1,2,3,4,5,13,6,7,8,9,10,11,12)]Get logfc and calculate a “logfcPAIR”
feelnc.pairs$lfc.lncRNA <- resdata.nc.DE[match(feelnc.pairs$lncRNA_gene, resdata.nc.DE$GeneENS),]$log2FoldChange
feelnc.pairs$lfc.partnerRNA <- resdata.nc.DE[match(feelnc.pairs$partnerRNA_gene, resdata.nc.DE$GeneENS),]$log2FoldChange
feelnc.pairs$lfc.PAIR <- abs(feelnc.pairs$lfc.lncRNA) * abs(feelnc.pairs$lfc.partnerRNA)
head(feelnc.pairs)get_cor_pairs <- function (feelnc.row) {
cordata <- plotCounts(dds.nc, as.character(unlist(feelnc.row["lncRNA_gene"])), "condition",returnData=T)
#get fpm for the 2 genes
for (i in as.character(unlist(feelnc.row[c("lncRNA_gene","partnerRNA_gene")]))){
cordata[,i] <- log2(fpm(dds.nc)[match(i,rownames(dds.nc)),])
}
#filter and get only the specific case control now
cordata <- cordata[str_detect(cordata$condition, eval(expression(paste(case,control,sep="|")))),]
#separate by control and case
cordata.control <- cordata[str_detect(cordata$condition, control),]
cordata.case <- cordata[str_detect(cordata$condition, case),]
#calculate correlation
cor.control <- cor.test(cordata.control[[as.character(unlist(feelnc.row["lncRNA_gene"]))]],cordata.control[[as.character(unlist(feelnc.row["partnerRNA_gene"]))]],method="pearson")
cor.case <- cor.test(cordata.case[[as.character(unlist(feelnc.row["lncRNA_gene"]))]],cordata.case[[as.character(unlist(feelnc.row["partnerRNA_gene"]))]],method="pearson")
#return correlation R and pvalue for each case and control
return(c(as.numeric(cor.control$est), cor.control$p.value,as.numeric(cor.case$est),cor.case$p.value))
}library(reshape2)
selpairs <- feelnc.pairs[order(feelnc.pairs$lfc.PAIR,decreasing = T),] %>%
dplyr::select(lncRNA_gene,partnerRNA_gene)
selpairs <- selpairs[!duplicated(selpairs),]
selpairs <- c(rbind(as.character(selpairs$lncRNA_gene),
as.character(selpairs$partnerRNA_gene)))
selpairs <- selpairs[1:12]
# selGenes.names <- selGenes[order(selGenes$stat),3]
data<-plotCounts(dds.nc, selpairs[1], "condition",returnData=T)
for (i in selpairs){
data[,i] <- log2(fpm(dds.nc)[match(i,rownames(dds.nc)),])
# data
}
# colnames(data) <- c(colnames(data)[1:2],selGenes.names)
data <- melt(data,id.vars='condition',measure.vars=selpairs)
case <- "IGF1R"
control <- "Ntg"
data <- data[str_detect(data$condition, eval(expression(paste(case,control,sep="|")))),]
data$type <- rep(c(rep("lncRNA",each=8),rep("mRNA",each=8)),6)
data$symbol <- as.character(apply(data,1,function(x) genes$Symbol[match(x['variable'],genes$GeneENS)]))
data$symbol <- coalesce(data$symbol,as.character(data$variable))
# data$symbol <- factor(data$symbol,levels = unique(data$symbol))
# data <- data[str_detect(data$condition, "IGF1R|Ntg"),]
data$id <- rep(LETTERS[seq( from = 1, to = 12 )],each=8)ggplot(data, aes(x=id, y=value, color=condition, shape=type)) +
geom_boxplot(alpha=0.2,width=0.2,show.legend=FALSE,position=position_dodge(0.3)) +
#scale_y_log10() +
scale_color_manual(values=unique(mycols[data$condition])) +
geom_point(position=position_dodge(width=0.3),size=2.5)+
labs(x="Gene",y="CPM (log2 transformed)",shape="Gene Type",color="Sample",
subtitle=paste0(case," vs ",control)) +
scale_x_discrete(labels=data$symbol[c(1:12)*8])+
# facet_wrap( ~ variable, scales="free") +
ggtitle("DE pairs in FEELnc: lncRNA vs mRNA") +
theme_minimal () +
theme(axis.text.x = element_text(angle = 30,hjust=1), plot.margin=margin(t = 5, r = 2, b = 5, l = 40, unit = "pt")) +
geom_vline(xintercept = c(0.5,2.5,4.5,6.5,8.5,10.5,12.5),color="gray50",size=0.5)## Warning: Removed 4 rows containing non-finite values (stat_boxplot).
# mycols <- hue_pal()(9)[as.numeric(factor(condition))]
# logcounts <- log2(counts(dds,normalized=TRUE) + 1)
# logcounts <- log2(fpm(dds.nc) + 1)
# boxplot(logcounts,
# xlab="Sample ID",
# ylab="Log2(Counts) - Norm",
# las=2, cex.axis=0.7,
# col=mycols_boxplot)
# abline(h=median(as.matrix(logcounts)), col="black")Correlation with respect to counts, cpm, log2counts and log2cpm: (In order to identify which is the best way to calculate correlation)
# selpairs <- c("NONMMUG038164.2", "ENSMUSG00000102252")
data<-plotCounts(dds.nc, selpairs[1], "condition",returnData=T)
for (i in selpairs[1:2]){
data[,i] <- log2(fpm(dds.nc)[match(i,rownames(dds.nc)),])
# data
}
data <- data[str_detect(data$condition, eval(expression(paste(case,control,sep="|")))),]data.case <- data[str_detect(data$condition, case),]
data.control <- data[str_detect(data$condition, control),]
cor.case <- cor.test(data.case[[selpairs[1]]],data.case[[selpairs[2]]],method="pearson")
cor.control <- cor.test(data.control[[selpairs[1]]],data.control[[selpairs[2]]],method="pearson")
# plot(data.control$NONMMUG087743.1,data.control$ENSMUSG00000059498)
# plot(data.case$NONMMUG087743.1,data.case$ENSMUSG00000059498)create_cor_plot <- function(pData, corcontrol, corcase, xVar,yVar,legx,legy,axis=c("title","xlab","ylab")) {
q_plot <- ggplot(pData, aes_string(x=xVar, y=yVar,color=condition))
q_plot + geom_point(aes(color=condition),size=4) +
scale_color_manual(values=unique(mycols[pData$condition])) +
geom_smooth(method=lm,se=T,aes(color=condition),alpha=0.2) +
annotate(geom="text", x=legx, y=legy,
label=paste0("Control~Data:~R^2==", round(corcontrol**2,3)), parse=T, color="black",hjust=0) +
annotate(geom="text", x=legx, y=legy-(0.1*legy),
label=paste0("Case~Data:~R^2==", round(corcase**2,3)), parse=T, color="black",hjust=0) +
ggtitle(axis[1])
}
# create_cor_plot(data.control,cor.control$est,"NONMMUG087743.1","ENSMUSG00000059498",
# min(data.control[3:4]),max(data.control[3:4]),"Correlation between lncRNA and mRNA pair - Control")
# create_cor_plot(data.case,cor.case$est,"NONMMUG087743.1","ENSMUSG00000059498",
# min(data.case[3:4]),max(data.case[3:4]),"Correlation between lncRNA and mRNA pair - Case")
create_cor_plot(data,cor.control$est,cor.case$est,selpairs[1],selpairs[2],
min(data[3:4]),max(data[3:4]),"log2CPM - Correlation between lncRNA and mRNA pair - Case")## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 4 rows containing non-finite values (stat_smooth).
data<-plotCounts(dds.nc, selpairs[1], "condition",returnData=T)
for (i in selpairs[1:2]){
data[,i] <- log2(counts(dds.nc,normalized=T)[match(i,rownames(dds.nc)),])
# data
}
data <- data[str_detect(data$condition, eval(expression(paste(case,control,sep="|")))),]
data.case <- data[str_detect(data$condition, case),]
data.control <- data[str_detect(data$condition, control),]
cor.case <- cor.test(data.case[,3],data.case[,4],method="pearson")
cor.control <- cor.test(data.control[,3],data.control[,4],method="pearson")
create_cor_plot(data,cor.control$est,cor.case$est,names(data.control)[3],names(data.control)[4],
min(data[3:4]),max(data[3:4]),"log2Counts - Correlation between lncRNA and mRNA pair - Case")## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 4 rows containing non-finite values (stat_smooth).
data<-plotCounts(dds.nc, selpairs[1], "condition",returnData=T)
for (i in selpairs[1:2]){
data[,i] <- fpm(dds.nc)[match(i,rownames(dds.nc)),]
# data
}
data <- data[str_detect(data$condition, eval(expression(paste(case,control,sep="|")))),]
data.case <- data[str_detect(data$condition, case),]
data.control <- data[str_detect(data$condition, control),]
cor.case <- cor.test(data.case[,3],data.case[,4],method="pearson")
cor.control <- cor.test(data.control[,3],data.control[,4],method="pearson")## Warning in cor(x, y): the standard deviation is zero
create_cor_plot(data,cor.control$est,cor.case$est,names(data.control)[3],names(data.control)[4],
min(data[3:4]),max(data[3:4]),"log2Counts - Correlation between lncRNA and mRNA pair - Case")## `geom_smooth()` using formula 'y ~ x'
data<-plotCounts(dds.nc, selpairs[1], "condition",returnData=T)
for (i in selpairs[1:2]){
data[,i] <- counts(dds.nc,normalized=T)[match(i,rownames(dds.nc)),]
# data
}
data <- data[str_detect(data$condition, eval(expression(paste(case,control,sep="|")))),]
data.case <- data[str_detect(data$condition, case),]
data.control <- data[str_detect(data$condition, control),]
cor.case <- cor.test(data.case[,3],data.case[,4],method="pearson")
cor.control <- cor.test(data.control[,3],data.control[,4],method="pearson")## Warning in cor(x, y): the standard deviation is zero
create_cor_plot(data,cor.control$est,cor.case$est,names(data.control)[3],names(data.control)[4],
min(data[3:4]),max(data[3:4]),"Counts - Correlation between lncRNA and mRNA pair - Case")## `geom_smooth()` using formula 'y ~ x'
–> I will use log2CPM to calculate correlation between genes!!
Run for all comparisons, as previously for DE genes
run_all_feelnc <- function (case,control,gene_matrix=genes){
myres <- results(dds.nc,contrast=c("condition",case,control),alpha=0.05,lfcThreshold = 0)
myres$symbol <- mapIds(org.Mm.eg.db,
keys=row.names(myres),
column="SYMBOL",
keytype="ENSEMBL",
multiVals="first")
myres$symbol <- coalesce(myres$symbol,g$external_gene_name[match(rownames(myres),g$ensembl_gene_id,)])
myres$entrezid <- mapIds(org.Mm.eg.db,
keys=row.names(myres),
column="ENTREZID",
keytype="ENSEMBL",
multiVals="first")
myres$refseq <- mapIds(org.Mm.eg.db,
keys=row.names(myres),
column="REFSEQ",
keytype="ENSEMBL",
multiVals="first")
print(table(myres$padj < 0.05))
#get myresults as previously, only DE genes padj < 0.05
resorder.nc <- myres[order(myres$padj), ]
resdata.nc <- merge(as.data.frame(resorder.nc),
as.data.frame(log2(fpm(dds.nc))),
by="row.names", sort=FALSE)
names(resdata.nc)[1] <- "Gene"
resdata.nc$GeneENS <- resdata.nc$Gene
resdata.nc$Gene <- myres$symbol[match(resdata.nc$Gene,rownames(myres))]
genes <- gene_matrix
resdata.nc$biotype <- genes$TypeOfGene[match(resdata.nc$entrezid,genes$GeneID)]
resdata.nc$biotype <- coalesce(resdata.nc$biotype,genes$TypeOfGene[match(resdata.nc$GeneENS,genes$GeneENS)])
resdata.nc[str_detect(resdata.nc$GeneENS,"NONMMU"),"biotype"] <- "ncRNA"
resdata.nc[str_detect(resdata.nc$GeneENS,"NONMMU"),"symbol"] <- resdata.nc[str_detect(resdata.nc$GeneENS,"NONMMU"),"GeneENS"]
resdata.nc[str_detect(resdata.nc$GeneENS,"NONMMU"),"Gene"] <- resdata.nc[str_detect(resdata.nc$GeneENS,"NONMMU"),"GeneENS"]
resdata.nc <- resdata.nc[rowSums(is.na(resdata.nc)) <= 3,]
resdata.nc <- resdata.nc[!is.na(resdata.nc$biotype),]
resdata.nc.DE <- filter(resdata.nc,padj < 0.05)
# print(table(resdata.nc.DE$padj < 0.05))
#get feelnc pairs in my results
feelnc.in.myres <- feelnc[which(feelnc$partnerRNA_gene %in% resdata.nc.DE$GeneENS),]
feelnc.pairs <- feelnc.in.myres[which(feelnc.in.myres$lncRNA_gene %in% resdata.nc.DE$GeneENS),]
feelnc.pairs <- feelnc.pairs[feelnc.pairs$isBest == 1,]
feelnc.pairs$partnerRNA_symbol <- genes$Symbol[match(feelnc.pairs$partnerRNA_gene,genes$GeneENS)]
feelnc.pairs$lncRNA_symbol <- coalesce(as.character(genes$Symbol[match(feelnc.pairs$lncRNA_gene,genes$GeneENS)]),as.character(feelnc.pairs$lncRNA_gene))
feelnc.pairs <- feelnc.pairs[,c(1,2,14,3,4,5,13,6,7,8,9,10,11,12)]
#get lfc for each partner and calculate the pair
feelnc.pairs$lfc.lncRNA <- resdata.nc.DE[match(feelnc.pairs$lncRNA_gene, resdata.nc.DE$GeneENS),]$log2FoldChange
feelnc.pairs$lfc.partnerRNA <- resdata.nc.DE[match(feelnc.pairs$partnerRNA_gene, resdata.nc.DE$GeneENS),]$log2FoldChange
feelnc.pairs$lfc.PAIR <- abs(feelnc.pairs$lfc.lncRNA) * abs(feelnc.pairs$lfc.partnerRNA)
#calculate correlation as previously
cor.parameters <- t(apply(feelnc.pairs,1,get_cor_pairs))
colnames(cor.parameters) <- c("control.cor.R","control.cor.pval","case.cor.R","case.cor.pval")
feelnc.pairs <- cbind(feelnc.pairs,cor.parameters)
#plot selected pairs, first 6 based on lfcPAIR
selpairs <- feelnc.pairs[order(feelnc.pairs$lfc.PAIR,decreasing = T),] %>%
dplyr::select(lncRNA_gene,partnerRNA_gene)
selpairs <- selpairs[!duplicated(selpairs),]
selpairs <- c(rbind(as.character(selpairs$lncRNA_gene),
as.character(selpairs$partnerRNA_gene)))
selpairs <- selpairs[1:min(12,length(selpairs))]
data<-plotCounts(dds.nc, selpairs[1], "condition",returnData=T)
for (i in selpairs){
data[,i] <- log2(fpm(dds.nc)[match(i,rownames(dds.nc)),])
}
data <- melt(data,id.vars='condition',measure.vars=selpairs)
data <- data[str_detect(data$condition, eval(expression(paste(case,control,sep="|")))),]
if (case == "SWIM" && control != 'NONSWIM' ){
data <- data[!str_detect(data$condition,"NONSWIM"),]
}
if (control == "SWIM"){
data <- data[!str_detect(data$condition,"NONSWIM"),]
}
data$type <- rep(c(rep("lncRNA",each=8),rep("mRNA",each=8)),dim(data)[1]/16)
data$symbol <- as.character(apply(data,1,function(x) genes$Symbol[match(x['variable'],genes$GeneENS)]))
data$symbol <- coalesce(data$symbol,as.character(data$variable))
# data$symbol <- factor(data$symbol,levels = unique(data$symbol))
data$id <- rep(LETTERS[seq( from = 1, to = dim(data)[1]/8 )],each=8)
p <- ggplot(data, aes(x=id, y=value, color=condition, shape=type)) +
geom_boxplot(alpha=0.2,width=0.2,show.legend=FALSE,position=position_dodge(0.3)) +
scale_color_manual(values=unique(mycols[data$condition])) +
geom_point(position=position_dodge(width=0.3),size=2.5)+
scale_x_discrete(labels=data$symbol[c(1:12)*8])+
labs(x="Gene",y="CPM (log2 transformed)", shape="Gene Type",color="Sample",
subtitle=paste0(case," vs ",control)) +
ggtitle("DE pairs in FEELnc: lncRNA vs mRNA") +
theme_minimal () +
guides(color = guide_legend(order = 1),
shape = guide_legend(order = 2)) +
theme(axis.text.x = element_text(angle = 18,hjust=1), plot.margin=margin(t = 5, r = 2, b = 5, l = 40, unit = "pt")) +
geom_vline(xintercept = c(0.5,2.5,4.5,6.5,8.5,10.5,12.5),color="gray50",size=0.5)
plot(p)
# facet_wrap( ~ variable, scales="free") +
ggtitle("DE pairs in FEELnc: lncRNA vs mRNA") +
theme_minimal () +
theme(axis.text.x = element_text(angle = 30,hjust=1), plot.margin=margin(t = 5, r = 2, b = 5, l = 40, unit = "pt")) +
geom_vline(xintercept = c(0.5,2.5,4.5,6.5,8.5,10.5,12.5),color="gray50",size=0.5)
#Save pairs in csv file
write.csv(feelnc.pairs,paste0("csv/feelnc_pairs_",case,"_vs_",control,".csv"))
}## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
##
## FALSE TRUE
## 11454 5016
## Warning: Removed 4 rows containing non-finite values (stat_boxplot).
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
##
## FALSE TRUE
## 16211 259
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
##
## FALSE TRUE
## 14973 1497
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
##
## FALSE TRUE
## 15731 739
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
##
## FALSE TRUE
## 12166 4304
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
##
## FALSE TRUE
## 15828 642
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
##
## FALSE TRUE
## 6695 9775
## Warning: Removed 4 rows containing non-finite values (stat_boxplot).
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
##
## FALSE TRUE
## 8193 8277
## Warning: Removed 3 rows containing non-finite values (stat_boxplot).
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
##
## FALSE TRUE
## 13582 2888
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
##
## FALSE TRUE
## 10010 6460
## Warning: Removed 3 rows containing non-finite values (stat_boxplot).
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
##
## FALSE TRUE
## 13855 2615
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
##
## FALSE TRUE
## 9913 6557
## Warning: Removed 2 rows containing non-finite values (stat_boxplot).
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
##
## FALSE TRUE
## 8124 8346
## Warning: Removed 3 rows containing non-finite values (stat_boxplot).
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
##
## FALSE TRUE
## 15149 1321
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
##
## FALSE TRUE
## 12844 432
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
##
## FALSE TRUE
## 13789 765
Loaded packages and other parameters
## R version 3.5.1 (2018-07-02)
## Platform: x86_64-apple-darwin13.4.0 (64-bit)
## Running under: macOS 10.15.5
##
## Matrix products: default
## BLAS/LAPACK: /Users/sruizcarmona/miniconda3/envs/rnaseq-351/lib/R/lib/libRlapack.dylib
##
## locale:
## [1] en_AU.UTF-8/en_AU.UTF-8/en_AU.UTF-8/C/en_AU.UTF-8/en_AU.UTF-8
##
## attached base packages:
## [1] parallel stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] reshape2_1.4.4 fgsea_1.8.0
## [3] Rcpp_1.0.4.6 msigdbr_7.0.1
## [5] fmsb_0.7.0 calibrate_1.7.5
## [7] MASS_7.3-51.6 biomaRt_2.38.0
## [9] colorspace_1.4-1 dplyr_0.8.5
## [11] org.Mm.eg.db_3.7.0 AnnotationDbi_1.44.0
## [13] pheatmap_1.0.12 scales_1.1.0
## [15] cowplot_1.0.0 stringr_1.4.0
## [17] RColorBrewer_1.1-2 gplots_3.0.3
## [19] ggplot2_3.3.0 DESeq2_1.22.1
## [21] SummarizedExperiment_1.12.0 DelayedArray_0.8.0
## [23] BiocParallel_1.16.2 matrixStats_0.56.0
## [25] Biobase_2.42.0 GenomicRanges_1.34.0
## [27] GenomeInfoDb_1.18.1 IRanges_2.16.0
## [29] S4Vectors_0.20.1 BiocGenerics_0.28.0
## [31] edgeR_3.26.0 limma_3.40.0
##
## loaded via a namespace (and not attached):
## [1] nlme_3.1-147 bitops_1.0-6 bit64_0.9-7
## [4] httr_1.4.1 progress_1.2.2 tools_3.5.1
## [7] backports_1.1.6 R6_2.4.1 rpart_4.1-15
## [10] KernSmooth_2.23-17 mgcv_1.8-31 Hmisc_4.4-0
## [13] DBI_1.1.0 nnet_7.3-14 withr_2.2.0
## [16] prettyunits_1.1.1 tidyselect_1.0.0 gridExtra_2.3
## [19] curl_4.3 bit_1.1-15.2 compiler_3.5.1
## [22] htmlTable_1.13.3 labeling_0.3 caTools_1.17.1.4
## [25] checkmate_2.0.0 genefilter_1.64.0 digest_0.6.25
## [28] foreign_0.8-76 rmarkdown_2.1 XVector_0.22.0
## [31] base64enc_0.1-3 pkgconfig_2.0.3 htmltools_0.4.0
## [34] htmlwidgets_1.5.1 rlang_0.4.5 rstudioapi_0.11
## [37] RSQLite_2.2.0 farver_2.0.3 jsonlite_1.6.1
## [40] gtools_3.8.2 acepack_1.4.1 RCurl_1.98-1.2
## [43] magrittr_1.5 GenomeInfoDbData_1.2.1 Formula_1.2-3
## [46] Matrix_1.2-18 munsell_0.5.0 lifecycle_0.2.0
## [49] stringi_1.4.6 yaml_2.2.1 zlibbioc_1.28.0
## [52] epuRate_0.1 plyr_1.8.6 grid_3.5.1
## [55] blob_1.2.1 gdata_2.18.0 crayon_1.3.4
## [58] lattice_0.20-41 splines_3.5.1 annotate_1.60.1
## [61] hms_0.5.3 locfit_1.5-9.4 knitr_1.28
## [64] pillar_1.4.3 geneplotter_1.60.0 fastmatch_1.1-0
## [67] XML_3.99-0.3 glue_1.4.0 evaluate_0.14
## [70] latticeExtra_0.6-28 data.table_1.12.8 vctrs_0.2.4
## [73] gtable_0.3.0 purrr_0.3.4 assertthat_0.2.1
## [76] xfun_0.13 xtable_1.8-4 survival_3.1-12
## [79] tibble_3.0.1 memoise_1.1.0 cluster_2.1.0
## [82] ellipsis_0.3.0
A work by Baker Bioinformatics (Sergio)